1use std::fmt;
15use std::iter::Peekable;
16use std::str::Chars;
17
/// Every token the lexer can produce.
///
/// Keyword variants carry no payload; their `Display` form is the upper-case
/// keyword text. Literal and identifier variants carry the scanned value.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- General query keywords ----
    Select,
    From,
    Where,
    And,
    Or,
    Not,
    Match,
    Return,
    Join,
    Graph,
    Path,
    To,
    Via,
    On,
    As,
    Is,
    Null,
    Between,
    Like,
    In,
    Order,
    By,
    Asc,
    Desc,
    Nulls,
    First,
    Last,
    Limit,
    Offset,
    Inner,
    Left,
    Right,
    Outer,
    Full,
    Cross,
    Starts,
    Ends,
    With,
    Contains,
    True,
    False,
    Enrich,
    Group,
    Count,
    Sum,
    Avg,
    Min,
    Max,
    Distinct,

    // ---- Vector / hybrid search keywords ----
    Vector,
    Search,
    Similar,
    Collection,
    Metric,
    Threshold,
    K,
    Hybrid,
    Fusion,
    Rerank,
    Rrf,
    Intersection,
    Union,
    Recursive,
    All,
    Weight,
    L2,
    Cosine,
    /// Scanned from either `INNER_PRODUCT` or `INNERPRODUCT`.
    InnerProduct,
    Include,
    Metadata,
    Vectors,

    // ---- Data manipulation / definition keywords ----
    Insert,
    Into,
    Values,
    Update,
    Set,
    Delete,
    Truncate,
    Create,
    Table,
    Drop,
    Alter,
    Add,
    Column,
    Primary,
    Explain,
    For,
    Format,
    Json,
    Key,
    Default,
    Compress,
    Index,
    Unique,
    If,
    Exists,
    Returning,
    Cascade,
    Rename,
    Using,

    Node,
    Edge,
    Document,
    Kv,

    Timeseries,
    Retention,
    Queue,
    Tree,
    Push,
    Pop,
    Peek,
    Purge,
    Ack,
    Nack,
    Priority,

    // ---- Graph keywords ----
    Neighborhood,
    /// Scanned from either `SHORTEST_PATH` or `SHORTESTPATH`.
    ShortestPath,
    Centrality,
    Community,
    Components,
    Cycles,
    Traverse,
    Depth,
    Direction,
    Algorithm,
    Strategy,
    /// Scanned from either `MAX_ITERATIONS` or `MAXITERATIONS`.
    MaxIterations,
    /// Scanned from either `MAX_LENGTH` or `MAXLENGTH`.
    MaxLength,
    Mode,
    Clustering,
    /// Scanned from either `TOPOLOGICAL_SORT` or `TOPOLOGICALSORT`.
    TopologicalSort,
    Properties,
    Text,
    Fuzzy,
    /// Scanned from either `MIN_SCORE` or `MINSCORE`.
    MinScore,

    // ---- Transaction keywords ----
    Begin,
    Commit,
    Rollback,
    Savepoint,
    Release,
    Start,
    Transaction,
    Work,

    Vacuum,
    Analyze,

    Schema,
    Sequence,
    Increment,

    Copy,
    Header,
    Delimiter,

    View,
    Materialized,
    Refresh,

    Partition,
    Range,
    List,
    Hash,
    Attach,
    Detach,
    Of,

    Policy,
    Enable,
    Disable,
    Security,
    Row,
    Level,

    Foreign,
    Server,
    Wrapper,
    Options,
    Data,

    // ---- Literals ----
    /// Quoted string literal; the payload is the content between the quotes.
    String(String),
    /// Integer literal.
    Integer(i64),
    /// Floating point literal.
    Float(f64),
    /// Raw text of a `{ ... }` JSON object literal, braces included.
    JsonLiteral(String),

    /// Any word that is not a recognised keyword.
    Ident(String),

    // ---- Operators and punctuation ----
    /// `=`
    Eq,
    /// `<>` (also produced for `!=`)
    Ne,
    /// `<`
    Lt,
    /// `<=`
    Le,
    /// `>`
    Gt,
    /// `>=`
    Ge,
    /// `+`
    Plus,
    /// `-` (displayed the same as [`Token::Dash`])
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `%`
    Percent,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `:`
    Colon,
    /// `;`
    Semi,
    /// `$`
    Dollar,
    /// `->`
    Arrow,
    /// `<-`
    ArrowLeft,
    /// `-`
    Dash,
    /// `..`
    DotDot,
    /// `|`
    Pipe,
    /// `||`
    DoublePipe,
    /// End of input.
    Eof,
}
273
274impl fmt::Display for Token {
275 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
276 match self {
277 Token::Select => write!(f, "SELECT"),
278 Token::From => write!(f, "FROM"),
279 Token::Where => write!(f, "WHERE"),
280 Token::And => write!(f, "AND"),
281 Token::Or => write!(f, "OR"),
282 Token::Not => write!(f, "NOT"),
283 Token::Match => write!(f, "MATCH"),
284 Token::Return => write!(f, "RETURN"),
285 Token::Join => write!(f, "JOIN"),
286 Token::Graph => write!(f, "GRAPH"),
287 Token::Path => write!(f, "PATH"),
288 Token::To => write!(f, "TO"),
289 Token::Via => write!(f, "VIA"),
290 Token::On => write!(f, "ON"),
291 Token::As => write!(f, "AS"),
292 Token::Is => write!(f, "IS"),
293 Token::Null => write!(f, "NULL"),
294 Token::Between => write!(f, "BETWEEN"),
295 Token::Like => write!(f, "LIKE"),
296 Token::In => write!(f, "IN"),
297 Token::Order => write!(f, "ORDER"),
298 Token::By => write!(f, "BY"),
299 Token::Asc => write!(f, "ASC"),
300 Token::Desc => write!(f, "DESC"),
301 Token::Nulls => write!(f, "NULLS"),
302 Token::First => write!(f, "FIRST"),
303 Token::Last => write!(f, "LAST"),
304 Token::Limit => write!(f, "LIMIT"),
305 Token::Offset => write!(f, "OFFSET"),
306 Token::Inner => write!(f, "INNER"),
307 Token::Left => write!(f, "LEFT"),
308 Token::Right => write!(f, "RIGHT"),
309 Token::Outer => write!(f, "OUTER"),
310 Token::Full => write!(f, "FULL"),
311 Token::Cross => write!(f, "CROSS"),
312 Token::Starts => write!(f, "STARTS"),
313 Token::Ends => write!(f, "ENDS"),
314 Token::With => write!(f, "WITH"),
315 Token::Contains => write!(f, "CONTAINS"),
316 Token::True => write!(f, "TRUE"),
317 Token::False => write!(f, "FALSE"),
318 Token::Enrich => write!(f, "ENRICH"),
319 Token::Group => write!(f, "GROUP"),
320 Token::Count => write!(f, "COUNT"),
321 Token::Sum => write!(f, "SUM"),
322 Token::Avg => write!(f, "AVG"),
323 Token::Min => write!(f, "MIN"),
324 Token::Max => write!(f, "MAX"),
325 Token::Distinct => write!(f, "DISTINCT"),
326 Token::Vector => write!(f, "VECTOR"),
327 Token::Search => write!(f, "SEARCH"),
328 Token::Similar => write!(f, "SIMILAR"),
329 Token::Collection => write!(f, "COLLECTION"),
330 Token::Metric => write!(f, "METRIC"),
331 Token::Threshold => write!(f, "THRESHOLD"),
332 Token::K => write!(f, "K"),
333 Token::Hybrid => write!(f, "HYBRID"),
334 Token::Fusion => write!(f, "FUSION"),
335 Token::Rerank => write!(f, "RERANK"),
336 Token::Rrf => write!(f, "RRF"),
337 Token::Intersection => write!(f, "INTERSECTION"),
338 Token::Union => write!(f, "UNION"),
339 Token::Recursive => write!(f, "RECURSIVE"),
340 Token::All => write!(f, "ALL"),
341 Token::Weight => write!(f, "WEIGHT"),
342 Token::L2 => write!(f, "L2"),
343 Token::Cosine => write!(f, "COSINE"),
344 Token::InnerProduct => write!(f, "INNER_PRODUCT"),
345 Token::Include => write!(f, "INCLUDE"),
346 Token::Metadata => write!(f, "METADATA"),
347 Token::Vectors => write!(f, "VECTORS"),
348 Token::Explain => write!(f, "EXPLAIN"),
349 Token::For => write!(f, "FOR"),
350 Token::Format => write!(f, "FORMAT"),
351 Token::Json => write!(f, "JSON"),
352 Token::Insert => write!(f, "INSERT"),
353 Token::Into => write!(f, "INTO"),
354 Token::Values => write!(f, "VALUES"),
355 Token::Update => write!(f, "UPDATE"),
356 Token::Set => write!(f, "SET"),
357 Token::Delete => write!(f, "DELETE"),
358 Token::Truncate => write!(f, "TRUNCATE"),
359 Token::Create => write!(f, "CREATE"),
360 Token::Table => write!(f, "TABLE"),
361 Token::Drop => write!(f, "DROP"),
362 Token::Alter => write!(f, "ALTER"),
363 Token::Add => write!(f, "ADD"),
364 Token::Column => write!(f, "COLUMN"),
365 Token::Primary => write!(f, "PRIMARY"),
366 Token::Key => write!(f, "KEY"),
367 Token::Default => write!(f, "DEFAULT"),
368 Token::Compress => write!(f, "COMPRESS"),
369 Token::Index => write!(f, "INDEX"),
370 Token::Unique => write!(f, "UNIQUE"),
371 Token::If => write!(f, "IF"),
372 Token::Exists => write!(f, "EXISTS"),
373 Token::Returning => write!(f, "RETURNING"),
374 Token::Cascade => write!(f, "CASCADE"),
375 Token::Rename => write!(f, "RENAME"),
376 Token::Using => write!(f, "USING"),
377 Token::Node => write!(f, "NODE"),
378 Token::Edge => write!(f, "EDGE"),
379 Token::Document => write!(f, "DOCUMENT"),
380 Token::Kv => write!(f, "KV"),
381 Token::Timeseries => write!(f, "TIMESERIES"),
382 Token::Retention => write!(f, "RETENTION"),
383 Token::Queue => write!(f, "QUEUE"),
384 Token::Tree => write!(f, "TREE"),
385 Token::Push => write!(f, "PUSH"),
386 Token::Pop => write!(f, "POP"),
387 Token::Peek => write!(f, "PEEK"),
388 Token::Purge => write!(f, "PURGE"),
389 Token::Ack => write!(f, "ACK"),
390 Token::Nack => write!(f, "NACK"),
391 Token::Priority => write!(f, "PRIORITY"),
392 Token::Neighborhood => write!(f, "NEIGHBORHOOD"),
393 Token::ShortestPath => write!(f, "SHORTEST_PATH"),
394 Token::Centrality => write!(f, "CENTRALITY"),
395 Token::Community => write!(f, "COMMUNITY"),
396 Token::Components => write!(f, "COMPONENTS"),
397 Token::Cycles => write!(f, "CYCLES"),
398 Token::Traverse => write!(f, "TRAVERSE"),
399 Token::Depth => write!(f, "DEPTH"),
400 Token::Direction => write!(f, "DIRECTION"),
401 Token::Algorithm => write!(f, "ALGORITHM"),
402 Token::Strategy => write!(f, "STRATEGY"),
403 Token::MaxIterations => write!(f, "MAX_ITERATIONS"),
404 Token::MaxLength => write!(f, "MAX_LENGTH"),
405 Token::Mode => write!(f, "MODE"),
406 Token::Clustering => write!(f, "CLUSTERING"),
407 Token::TopologicalSort => write!(f, "TOPOLOGICAL_SORT"),
408 Token::Properties => write!(f, "PROPERTIES"),
409 Token::Text => write!(f, "TEXT"),
410 Token::Fuzzy => write!(f, "FUZZY"),
411 Token::MinScore => write!(f, "MIN_SCORE"),
412 Token::Begin => write!(f, "BEGIN"),
413 Token::Commit => write!(f, "COMMIT"),
414 Token::Rollback => write!(f, "ROLLBACK"),
415 Token::Savepoint => write!(f, "SAVEPOINT"),
416 Token::Release => write!(f, "RELEASE"),
417 Token::Start => write!(f, "START"),
418 Token::Transaction => write!(f, "TRANSACTION"),
419 Token::Work => write!(f, "WORK"),
420 Token::Vacuum => write!(f, "VACUUM"),
421 Token::Analyze => write!(f, "ANALYZE"),
422 Token::Schema => write!(f, "SCHEMA"),
423 Token::Sequence => write!(f, "SEQUENCE"),
424 Token::Increment => write!(f, "INCREMENT"),
425 Token::Copy => write!(f, "COPY"),
426 Token::Header => write!(f, "HEADER"),
427 Token::Delimiter => write!(f, "DELIMITER"),
428 Token::View => write!(f, "VIEW"),
429 Token::Materialized => write!(f, "MATERIALIZED"),
430 Token::Refresh => write!(f, "REFRESH"),
431 Token::Partition => write!(f, "PARTITION"),
432 Token::Range => write!(f, "RANGE"),
433 Token::List => write!(f, "LIST"),
434 Token::Hash => write!(f, "HASH"),
435 Token::Attach => write!(f, "ATTACH"),
436 Token::Detach => write!(f, "DETACH"),
437 Token::Of => write!(f, "OF"),
438 Token::Policy => write!(f, "POLICY"),
439 Token::Enable => write!(f, "ENABLE"),
440 Token::Disable => write!(f, "DISABLE"),
441 Token::Security => write!(f, "SECURITY"),
442 Token::Row => write!(f, "ROW"),
443 Token::Level => write!(f, "LEVEL"),
444 Token::Foreign => write!(f, "FOREIGN"),
445 Token::Server => write!(f, "SERVER"),
446 Token::Wrapper => write!(f, "WRAPPER"),
447 Token::Options => write!(f, "OPTIONS"),
448 Token::Data => write!(f, "DATA"),
449 Token::String(s) => write!(f, "'{}'", s),
450 Token::Integer(n) => write!(f, "{}", n),
451 Token::Float(n) => write!(f, "{}", n),
452 Token::JsonLiteral(s) => write!(f, "{}", s),
453 Token::Ident(s) => write!(f, "{}", s),
454 Token::Eq => write!(f, "="),
455 Token::Ne => write!(f, "<>"),
456 Token::Lt => write!(f, "<"),
457 Token::Le => write!(f, "<="),
458 Token::Gt => write!(f, ">"),
459 Token::Ge => write!(f, ">="),
460 Token::Plus => write!(f, "+"),
461 Token::Minus => write!(f, "-"),
462 Token::Star => write!(f, "*"),
463 Token::Slash => write!(f, "/"),
464 Token::Percent => write!(f, "%"),
465 Token::LParen => write!(f, "("),
466 Token::RParen => write!(f, ")"),
467 Token::LBracket => write!(f, "["),
468 Token::RBracket => write!(f, "]"),
469 Token::LBrace => write!(f, "{{"),
470 Token::RBrace => write!(f, "}}"),
471 Token::Comma => write!(f, ","),
472 Token::Dot => write!(f, "."),
473 Token::Colon => write!(f, ":"),
474 Token::Semi => write!(f, ";"),
475 Token::Dollar => write!(f, "$"),
476 Token::Arrow => write!(f, "->"),
477 Token::ArrowLeft => write!(f, "<-"),
478 Token::Dash => write!(f, "-"),
479 Token::DotDot => write!(f, ".."),
480 Token::Pipe => write!(f, "|"),
481 Token::DoublePipe => write!(f, "||"),
482 Token::Eof => write!(f, "EOF"),
483 }
484 }
485}
486
/// A location in the lexer's input text.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct Position {
    /// Line number. The lexer starts at 1 and increments on every `'\n'`.
    pub line: u32,
    /// Column number. The lexer starts at 1, adds one per consumed character
    /// and resets to 1 after a newline.
    pub column: u32,
    /// Byte offset into the input; the lexer advances it by each consumed
    /// character's UTF-8 length.
    pub offset: u32,
}
497
498impl Position {
499 pub fn new(line: u32, column: u32, offset: u32) -> Self {
501 Self {
502 line,
503 column,
504 offset,
505 }
506 }
507}
508
509impl fmt::Display for Position {
510 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
511 write!(f, "{}:{}", self.line, self.column)
512 }
513}
514
/// A token together with the positions the lexer recorded around it.
#[derive(Debug, Clone)]
pub struct Spanned {
    /// The token itself.
    pub token: Token,
    /// Lexer position taken just before the token was scanned.
    pub start: Position,
    /// Lexer position taken right after the token was scanned.
    pub end: Position,
}
525
526impl Spanned {
527 pub fn new(token: Token, start: Position, end: Position) -> Self {
529 Self { token, start, end }
530 }
531}
532
/// Error produced while tokenizing.
#[derive(Debug, Clone)]
pub struct LexerError {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// Input position associated with the error.
    pub position: Position,
    /// Set when the error was caused by exceeding a configured lexer limit;
    /// `None` for ordinary lexical errors.
    pub limit_hit: Option<LexerLimitHit>,
}
545
/// Machine-readable description of which lexer limit was exceeded.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerLimitHit {
    /// An identifier grew past the configured maximum character count.
    IdentifierTooLong {
        /// Name of the limit that was hit (the identifier scanner reports
        /// `"max_identifier_chars"`).
        limit_name: &'static str,
        /// The configured limit value that was exceeded.
        value: usize,
    },
}
555
556impl LexerError {
557 pub fn new(message: impl Into<String>, position: Position) -> Self {
559 Self {
560 message: message.into(),
561 position,
562 limit_hit: None,
563 }
564 }
565
566 pub(crate) fn with_limit(
568 message: impl Into<String>,
569 position: Position,
570 limit_hit: LexerLimitHit,
571 ) -> Self {
572 Self {
573 message: message.into(),
574 position,
575 limit_hit: Some(limit_hit),
576 }
577 }
578}
579
580impl fmt::Display for LexerError {
581 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
582 write!(f, "Lexer error at {}: {}", self.position, self.message)
583 }
584}
585
// Marker impl: `LexerError` relies on the trait's default methods, so it can be
// used wherever a `std::error::Error` is expected.
impl std::error::Error for LexerError {}
587
/// Maximum size, in bytes, of a `{ ... }` JSON object literal (16 MiB).
/// The JSON literal scanner returns a `LexerError` once a literal grows past
/// this bound.
pub const JSON_LITERAL_MAX_BYTES: usize = 16 * 1024 * 1024;
595
/// Streaming tokenizer over a borrowed input string.
pub struct Lexer<'a> {
    /// The complete input; sliced by byte offset for comment lookahead and
    /// for extracting raw JSON literals.
    input: &'a str,
    /// Character iterator over `input` with one character of lookahead.
    chars: Peekable<Chars<'a>>,
    /// Current line (starts at 1).
    line: u32,
    /// Current column (starts at 1).
    column: u32,
    /// Current byte offset into `input` (starts at 0).
    offset: u32,
    /// One-token lookahead buffer filled by `peek_token`.
    peeked: Option<Spanned>,
    /// A single character pushed back by `unget`, with the position it was
    /// originally read at.
    putback: Option<(char, Position)>,
    /// Maximum number of characters allowed in one identifier.
    max_identifier_chars: usize,
}
614
615impl<'a> Lexer<'a> {
616 pub fn new(input: &'a str) -> Self {
618 Self::with_limits(
619 input,
620 crate::storage::query::parser::ParserLimits::default(),
621 )
622 }
623
624 pub fn with_limits(
626 input: &'a str,
627 limits: crate::storage::query::parser::ParserLimits,
628 ) -> Self {
629 Self {
630 input,
631 chars: input.chars().peekable(),
632 line: 1,
633 column: 1,
634 offset: 0,
635 peeked: None,
636 putback: None,
637 max_identifier_chars: limits.max_identifier_chars,
638 }
639 }
640
/// Returns the maximum identifier length (in characters) this lexer enforces.
pub(crate) fn max_identifier_chars(&self) -> usize {
    self.max_identifier_chars
}
647
648 fn position(&self) -> Position {
650 Position::new(self.line, self.column, self.offset)
651 }
652
653 fn unget(&mut self, ch: char, pos: Position) {
655 self.putback = Some((ch, pos));
656 }
657
658 fn advance(&mut self) -> Option<char> {
660 if let Some((ch, pos)) = self.putback.take() {
662 self.line = pos.line;
664 self.column = pos.column + 1;
665 self.offset = pos.offset + ch.len_utf8() as u32;
666 return Some(ch);
667 }
668
669 let ch = self.chars.next()?;
670 self.offset += ch.len_utf8() as u32;
671 if ch == '\n' {
672 self.line += 1;
673 self.column = 1;
674 } else {
675 self.column += 1;
676 }
677 Some(ch)
678 }
679
680 fn peek(&mut self) -> Option<char> {
682 if let Some((ch, _)) = &self.putback {
684 return Some(*ch);
685 }
686 self.chars.peek().copied()
687 }
688
689 fn skip_whitespace(&mut self) {
691 while let Some(ch) = self.peek() {
692 if ch.is_whitespace() {
693 self.advance();
694 } else if ch == '-' {
695 let pos = self.position();
697 self.advance();
698 if self.peek() == Some('-') {
699 self.advance();
701 while let Some(c) = self.peek() {
702 if c == '\n' {
703 break;
704 }
705 self.advance();
706 }
707 } else {
708 self.line = pos.line;
711 self.column = pos.column;
712 self.offset = pos.offset;
713 break;
716 }
717 } else {
718 break;
719 }
720 }
721 }
722
723 pub fn peek_token(&mut self) -> Result<&Spanned, LexerError> {
725 if self.peeked.is_none() {
726 self.peeked = Some(self.next_token_internal()?);
727 }
728 Ok(self.peeked.as_ref().unwrap())
729 }
730
731 pub fn next_token(&mut self) -> Result<Spanned, LexerError> {
733 if let Some(tok) = self.peeked.take() {
734 return Ok(tok);
735 }
736 self.next_token_internal()
737 }
738
739 fn next_token_internal(&mut self) -> Result<Spanned, LexerError> {
741 self.skip_whitespace_simple();
742
743 let start = self.position();
744
745 let ch = match self.peek() {
746 Some(c) => c,
747 None => {
748 return Ok(Spanned::new(Token::Eof, start, start));
749 }
750 };
751
752 let token = match ch {
754 '\'' | '"' => self.scan_string()?,
756
757 '0'..='9' => self.scan_number()?,
759
760 'a'..='z' | 'A'..='Z' | '_' => self.scan_identifier()?,
762
763 '=' => {
765 self.advance();
766 Token::Eq
767 }
768 '<' => self.scan_less_than()?,
769 '>' => self.scan_greater_than()?,
770 '!' => {
771 self.advance();
772 if self.peek() == Some('=') {
773 self.advance();
774 Token::Ne
775 } else {
776 return Err(LexerError::new("Expected '=' after '!'", start));
777 }
778 }
779 '+' => {
780 self.advance();
781 Token::Plus
782 }
783 '-' => self.scan_minus()?,
784 '*' => {
785 self.advance();
786 Token::Star
787 }
788 '/' => {
789 self.advance();
790 Token::Slash
791 }
792 '%' => {
793 self.advance();
794 Token::Percent
795 }
796 '(' => {
797 self.advance();
798 Token::LParen
799 }
800 ')' => {
801 self.advance();
802 Token::RParen
803 }
804 '[' => {
805 self.advance();
806 Token::LBracket
807 }
808 ']' => {
809 self.advance();
810 Token::RBracket
811 }
812 '{' => {
813 if self.looks_like_json_object_start() {
820 return self.scan_json_literal(start);
821 }
822 self.advance();
823 Token::LBrace
824 }
825 '}' => {
826 self.advance();
827 Token::RBrace
828 }
829 ',' => {
830 self.advance();
831 Token::Comma
832 }
833 '.' => self.scan_dot()?,
834 ':' => {
835 self.advance();
836 Token::Colon
837 }
838 ';' => {
839 self.advance();
840 Token::Semi
841 }
842 '$' => {
843 self.advance();
844 Token::Dollar
845 }
846 '|' => {
847 self.advance();
848 if self.peek() == Some('|') {
849 self.advance();
850 Token::DoublePipe
851 } else {
852 Token::Pipe
853 }
854 }
855 _ => {
856 return Err(LexerError::new(
857 format!("Unexpected character: '{}'", ch),
858 start,
859 ));
860 }
861 };
862
863 let end = self.position();
864 Ok(Spanned::new(token, start, end))
865 }
866
867 fn skip_whitespace_simple(&mut self) {
869 while let Some(ch) = self.peek() {
870 if ch.is_whitespace() {
871 self.advance();
872 } else if ch == '-' && self.input[self.offset as usize..].starts_with("--") {
873 self.advance();
874 self.advance();
875 while let Some(c) = self.peek() {
876 if c == '\n' {
877 break;
878 }
879 self.advance();
880 }
881 } else if ch == '/' && self.input[self.offset as usize..].starts_with("/*") {
882 self.advance();
883 self.advance();
884 while let Some(c) = self.peek() {
885 self.advance();
886 if c == '*' && self.peek() == Some('/') {
887 self.advance();
888 break;
889 }
890 }
891 } else {
892 break;
893 }
894 }
895 }
896
897 fn scan_string(&mut self) -> Result<Token, LexerError> {
899 let quote = self.advance().unwrap(); let start = self.position();
901 let mut value = String::new();
902
903 loop {
904 match self.peek() {
905 None => {
906 return Err(LexerError::new("Unterminated string", start));
907 }
908 Some(c) if c == quote => {
909 self.advance();
910 if self.peek() == Some(quote) {
912 self.advance();
913 value.push(quote);
914 } else {
915 break;
916 }
917 }
918 Some('\\') => {
919 self.advance();
920 match self.peek() {
921 Some('n') => {
922 self.advance();
923 value.push('\n');
924 }
925 Some('r') => {
926 self.advance();
927 value.push('\r');
928 }
929 Some('t') => {
930 self.advance();
931 value.push('\t');
932 }
933 Some('\\') => {
934 self.advance();
935 value.push('\\');
936 }
937 Some(c) if c == quote => {
938 self.advance();
939 value.push(quote);
940 }
941 Some(c) => {
942 value.push('\\');
944 value.push(c);
945 self.advance();
946 }
947 None => {
948 return Err(LexerError::new("Unterminated string", start));
949 }
950 }
951 }
952 Some(c) => {
953 self.advance();
954 value.push(c);
955 }
956 }
957 }
958
959 Ok(Token::String(value))
960 }
961
962 fn scan_number(&mut self) -> Result<Token, LexerError> {
964 let mut value = String::new();
965 let mut is_float = false;
966
967 while let Some(ch) = self.peek() {
969 if ch.is_ascii_digit() {
970 value.push(ch);
971 self.advance();
972 } else {
973 break;
974 }
975 }
976
977 if self.peek() == Some('.') {
979 let dot_pos = self.position();
981 self.advance(); if self.peek() == Some('.') {
984 self.unget('.', dot_pos);
986 } else if self.peek().map(|c| c.is_ascii_digit()).unwrap_or(false) {
988 is_float = true;
989 value.push('.');
990 while let Some(ch) = self.peek() {
991 if ch.is_ascii_digit() {
992 value.push(ch);
993 self.advance();
994 } else {
995 break;
996 }
997 }
998 } else {
999 self.unget('.', dot_pos);
1001 }
1002 }
1003
1004 if self.peek() == Some('e') || self.peek() == Some('E') {
1006 is_float = true;
1007 value.push(self.advance().unwrap());
1008
1009 if self.peek() == Some('+') || self.peek() == Some('-') {
1010 value.push(self.advance().unwrap());
1011 }
1012
1013 while let Some(ch) = self.peek() {
1014 if ch.is_ascii_digit() {
1015 value.push(ch);
1016 self.advance();
1017 } else {
1018 break;
1019 }
1020 }
1021 }
1022
1023 if is_float {
1024 match value.parse::<f64>() {
1025 Ok(n) => Ok(Token::Float(n)),
1026 Err(_) => Err(LexerError::new(
1027 format!("Invalid float: {}", value),
1028 self.position(),
1029 )),
1030 }
1031 } else {
1032 match value.parse::<i64>() {
1033 Ok(n) => Ok(Token::Integer(n)),
1034 Err(_) => Err(LexerError::new(
1035 format!("Invalid integer: {}", value),
1036 self.position(),
1037 )),
1038 }
1039 }
1040 }
1041
1042 fn scan_identifier(&mut self) -> Result<Token, LexerError> {
1044 let start_pos = self.position();
1045 let mut value = String::new();
1046 let max = self.max_identifier_chars;
1047
1048 while let Some(ch) = self.peek() {
1049 if ch.is_alphanumeric() || ch == '_' {
1050 if value.chars().count() >= max {
1051 return Err(LexerError::with_limit(
1055 format!(
1056 "identifier exceeds maximum length (max_identifier_chars = {})",
1057 max
1058 ),
1059 start_pos,
1060 LexerLimitHit::IdentifierTooLong {
1061 limit_name: "max_identifier_chars",
1062 value: max,
1063 },
1064 ));
1065 }
1066 value.push(ch);
1067 self.advance();
1068 } else {
1069 break;
1070 }
1071 }
1072
1073 let token = match value.to_uppercase().as_str() {
1075 "SELECT" => Token::Select,
1076 "FROM" => Token::From,
1077 "WHERE" => Token::Where,
1078 "AND" => Token::And,
1079 "OR" => Token::Or,
1080 "NOT" => Token::Not,
1081 "MATCH" => Token::Match,
1082 "RETURN" => Token::Return,
1083 "JOIN" => Token::Join,
1084 "GRAPH" => Token::Graph,
1085 "PATH" => Token::Path,
1086 "TO" => Token::To,
1087 "VIA" => Token::Via,
1088 "ON" => Token::On,
1089 "AS" => Token::As,
1090 "IS" => Token::Is,
1091 "NULL" => Token::Null,
1092 "BETWEEN" => Token::Between,
1093 "LIKE" => Token::Like,
1094 "IN" => Token::In,
1095 "ORDER" => Token::Order,
1096 "BY" => Token::By,
1097 "ASC" => Token::Asc,
1098 "DESC" => Token::Desc,
1099 "NULLS" => Token::Nulls,
1100 "FIRST" => Token::First,
1101 "LAST" => Token::Last,
1102 "LIMIT" => Token::Limit,
1103 "OFFSET" => Token::Offset,
1104 "INNER" => Token::Inner,
1105 "LEFT" => Token::Left,
1106 "RIGHT" => Token::Right,
1107 "OUTER" => Token::Outer,
1108 "FULL" => Token::Full,
1109 "CROSS" => Token::Cross,
1110 "STARTS" => Token::Starts,
1111 "ENDS" => Token::Ends,
1112 "WITH" => Token::With,
1113 "CONTAINS" => Token::Contains,
1114 "TRUE" => Token::True,
1115 "FALSE" => Token::False,
1116 "ENRICH" => Token::Enrich,
1117 "GROUP" => Token::Group,
1118 "COUNT" => Token::Count,
1119 "SUM" => Token::Sum,
1120 "AVG" => Token::Avg,
1121 "MIN" => Token::Min,
1122 "MAX" => Token::Max,
1123 "DISTINCT" => Token::Distinct,
1124 "VECTOR" => Token::Vector,
1125 "SEARCH" => Token::Search,
1126 "SIMILAR" => Token::Similar,
1127 "COLLECTION" => Token::Collection,
1128 "METRIC" => Token::Metric,
1129 "THRESHOLD" => Token::Threshold,
1130 "K" => Token::K,
1131 "HYBRID" => Token::Hybrid,
1132 "FUSION" => Token::Fusion,
1133 "RERANK" => Token::Rerank,
1134 "RRF" => Token::Rrf,
1135 "INTERSECTION" => Token::Intersection,
1136 "UNION" => Token::Union,
1137 "RECURSIVE" => Token::Recursive,
1138 "ALL" => Token::All,
1139 "WEIGHT" => Token::Weight,
1140 "L2" => Token::L2,
1141 "COSINE" => Token::Cosine,
1142 "INNER_PRODUCT" | "INNERPRODUCT" => Token::InnerProduct,
1143 "INCLUDE" => Token::Include,
1144 "METADATA" => Token::Metadata,
1145 "VECTORS" => Token::Vectors,
1146 "EXPLAIN" => Token::Explain,
1147 "FOR" => Token::For,
1148 "FORMAT" => Token::Format,
1149 "JSON" => Token::Json,
1150 "INSERT" => Token::Insert,
1151 "INTO" => Token::Into,
1152 "VALUES" => Token::Values,
1153 "UPDATE" => Token::Update,
1154 "SET" => Token::Set,
1155 "DELETE" => Token::Delete,
1156 "TRUNCATE" => Token::Truncate,
1157 "CREATE" => Token::Create,
1158 "TABLE" => Token::Table,
1159 "DROP" => Token::Drop,
1160 "ALTER" => Token::Alter,
1161 "ADD" => Token::Add,
1162 "COLUMN" => Token::Column,
1163 "PRIMARY" => Token::Primary,
1164 "KEY" => Token::Key,
1165 "DEFAULT" => Token::Default,
1166 "COMPRESS" => Token::Compress,
1167 "INDEX" => Token::Index,
1168 "UNIQUE" => Token::Unique,
1169 "IF" => Token::If,
1170 "EXISTS" => Token::Exists,
1171 "RETURNING" => Token::Returning,
1172 "CASCADE" => Token::Cascade,
1173 "RENAME" => Token::Rename,
1174 "USING" => Token::Using,
1175 "NODE" => Token::Node,
1176 "EDGE" => Token::Edge,
1177 "DOCUMENT" => Token::Document,
1178 "KV" => Token::Kv,
1179 "TIMESERIES" => Token::Timeseries,
1180 "RETENTION" => Token::Retention,
1181 "QUEUE" => Token::Queue,
1182 "TREE" => Token::Tree,
1183 "PUSH" => Token::Push,
1184 "POP" => Token::Pop,
1185 "PEEK" => Token::Peek,
1186 "PURGE" => Token::Purge,
1187 "ACK" => Token::Ack,
1188 "NACK" => Token::Nack,
1189 "PRIORITY" => Token::Priority,
1190 "LPUSH" => Token::Ident("LPUSH".to_string()),
1191 "RPUSH" => Token::Ident("RPUSH".to_string()),
1192 "LPOP" => Token::Ident("LPOP".to_string()),
1193 "RPOP" => Token::Ident("RPOP".to_string()),
1194 "NEIGHBORHOOD" => Token::Neighborhood,
1195 "SHORTEST_PATH" | "SHORTESTPATH" => Token::ShortestPath,
1196 "CENTRALITY" => Token::Centrality,
1197 "COMMUNITY" => Token::Community,
1198 "COMPONENTS" => Token::Components,
1199 "CYCLES" => Token::Cycles,
1200 "TRAVERSE" => Token::Traverse,
1201 "DEPTH" => Token::Depth,
1202 "DIRECTION" => Token::Direction,
1203 "ALGORITHM" => Token::Algorithm,
1204 "STRATEGY" => Token::Strategy,
1205 "MAX_ITERATIONS" | "MAXITERATIONS" => Token::MaxIterations,
1206 "MAX_LENGTH" | "MAXLENGTH" => Token::MaxLength,
1207 "MODE" => Token::Mode,
1208 "CLUSTERING" => Token::Clustering,
1209 "TOPOLOGICAL_SORT" | "TOPOLOGICALSORT" => Token::TopologicalSort,
1210 "PROPERTIES" => Token::Properties,
1211 "TEXT" => Token::Text,
1212 "FUZZY" => Token::Fuzzy,
1213 "MIN_SCORE" | "MINSCORE" => Token::MinScore,
1214 "BEGIN" => Token::Begin,
1215 "COMMIT" => Token::Commit,
1216 "ROLLBACK" => Token::Rollback,
1217 "SAVEPOINT" => Token::Savepoint,
1218 "RELEASE" => Token::Release,
1219 "START" => Token::Start,
1220 "TRANSACTION" => Token::Transaction,
1221 "WORK" => Token::Work,
1222 "VACUUM" => Token::Vacuum,
1223 "ANALYZE" => Token::Analyze,
1224 "SCHEMA" => Token::Schema,
1225 "SEQUENCE" => Token::Sequence,
1226 "INCREMENT" => Token::Increment,
1227 "COPY" => Token::Copy,
1228 "HEADER" => Token::Header,
1229 "DELIMITER" => Token::Delimiter,
1230 "VIEW" => Token::View,
1231 "MATERIALIZED" => Token::Materialized,
1232 "REFRESH" => Token::Refresh,
1233 "PARTITION" => Token::Partition,
1234 "RANGE" => Token::Range,
1235 "LIST" => Token::List,
1236 "HASH" => Token::Hash,
1237 "ATTACH" => Token::Attach,
1238 "DETACH" => Token::Detach,
1239 "OF" => Token::Of,
1240 "POLICY" => Token::Policy,
1241 "ENABLE" => Token::Enable,
1242 "DISABLE" => Token::Disable,
1243 "SECURITY" => Token::Security,
1244 "ROW" => Token::Row,
1245 "LEVEL" => Token::Level,
1246 "FOREIGN" => Token::Foreign,
1247 "SERVER" => Token::Server,
1248 "WRAPPER" => Token::Wrapper,
1249 "OPTIONS" => Token::Options,
1250 "DATA" => Token::Data,
1251 _ => Token::Ident(value),
1252 };
1253
1254 Ok(token)
1255 }
1256
1257 fn scan_less_than(&mut self) -> Result<Token, LexerError> {
1259 self.advance(); match self.peek() {
1261 Some('=') => {
1262 self.advance();
1263 Ok(Token::Le)
1264 }
1265 Some('>') => {
1266 self.advance();
1267 Ok(Token::Ne)
1268 }
1269 Some('-') => {
1270 self.advance();
1271 Ok(Token::ArrowLeft)
1272 }
1273 _ => Ok(Token::Lt),
1274 }
1275 }
1276
1277 fn scan_greater_than(&mut self) -> Result<Token, LexerError> {
1279 self.advance(); if self.peek() == Some('=') {
1281 self.advance();
1282 Ok(Token::Ge)
1283 } else {
1284 Ok(Token::Gt)
1285 }
1286 }
1287
1288 fn scan_minus(&mut self) -> Result<Token, LexerError> {
1290 self.advance(); match self.peek() {
1292 Some('>') => {
1293 self.advance();
1294 Ok(Token::Arrow)
1295 }
1296 Some('-') => {
1297 self.advance();
1299 while let Some(c) = self.peek() {
1300 if c == '\n' {
1301 break;
1302 }
1303 self.advance();
1304 }
1305 self.skip_whitespace_simple();
1307 if self.peek().is_none() {
1308 Ok(Token::Eof)
1309 } else {
1310 let next = self.next_token_internal()?;
1311 Ok(next.token)
1312 }
1313 }
1314 _ => Ok(Token::Dash),
1315 }
1316 }
1317
1318 fn scan_dot(&mut self) -> Result<Token, LexerError> {
1320 self.advance(); if self.peek() == Some('.') {
1322 self.advance();
1323 Ok(Token::DotDot)
1324 } else {
1325 Ok(Token::Dot)
1326 }
1327 }
1328
1329 fn looks_like_json_object_start(&self) -> bool {
1334 let bytes = self.input.as_bytes();
1335 let mut i = self.offset as usize;
1336 debug_assert!(bytes.get(i) == Some(&b'{'));
1338 i += 1;
1339 while i < bytes.len() {
1340 match bytes[i] {
1341 b' ' | b'\t' | b'\n' | b'\r' => i += 1,
1342 b'"' | b'}' => return true,
1343 _ => return false,
1344 }
1345 }
1346 false
1347 }
1348
1349 fn scan_json_literal(&mut self, start: Position) -> Result<Spanned, LexerError> {
1366 let start_offset = self.offset as usize;
1367 self.advance();
1369 let mut depth: u32 = 1;
1370 let mut in_string = false;
1371 let mut escape = false;
1372 loop {
1373 let ch = match self.peek() {
1374 Some(c) => c,
1375 None => {
1376 return Err(LexerError::new(
1377 format!(
1378 "unterminated JSON object literal (started at offset {})",
1379 start.offset
1380 ),
1381 self.position(),
1382 ));
1383 }
1384 };
1385
1386 let scanned_bytes = self.offset as usize - start_offset;
1388 if scanned_bytes > JSON_LITERAL_MAX_BYTES {
1389 return Err(LexerError::new(
1390 format!(
1391 "JSON object literal exceeds JSON_LITERAL_MAX_BYTES ({} bytes)",
1392 JSON_LITERAL_MAX_BYTES
1393 ),
1394 start,
1395 ));
1396 }
1397
1398 self.advance();
1399
1400 if escape {
1401 escape = false;
1402 continue;
1403 }
1404
1405 if in_string {
1406 match ch {
1407 '\\' => escape = true,
1408 '"' => in_string = false,
1409 _ => {}
1410 }
1411 continue;
1412 }
1413
1414 match ch {
1415 '"' => in_string = true,
1416 '{' => depth += 1,
1417 '}' => {
1418 depth -= 1;
1419 if depth == 0 {
1420 let end = self.position();
1421 let end_offset = self.offset as usize;
1422 if end_offset - start_offset > JSON_LITERAL_MAX_BYTES {
1424 return Err(LexerError::new(
1425 format!(
1426 "JSON object literal exceeds JSON_LITERAL_MAX_BYTES ({} bytes)",
1427 JSON_LITERAL_MAX_BYTES
1428 ),
1429 start,
1430 ));
1431 }
1432 let raw = self.input[start_offset..end_offset].to_string();
1433 return Ok(Spanned::new(Token::JsonLiteral(raw), start, end));
1434 }
1435 }
1436 _ => {}
1437 }
1438 }
1439 }
1440
1441 pub fn tokenize(&mut self) -> Result<Vec<Spanned>, LexerError> {
1443 let mut tokens = Vec::new();
1444 loop {
1445 let tok = self.next_token()?;
1446 let is_eof = tok.token == Token::Eof;
1447 tokens.push(tok);
1448 if is_eof {
1449 break;
1450 }
1451 }
1452 Ok(tokens)
1453 }
1454}
1455
1456#[cfg(test)]
1461mod tests {
1462 use super::*;
1463
1464 fn tokenize(input: &str) -> Vec<Token> {
1465 let mut lexer = Lexer::new(input);
1466 lexer
1467 .tokenize()
1468 .unwrap()
1469 .into_iter()
1470 .map(|s| s.token)
1471 .collect()
1472 }
1473
// Upper-case keywords map to their dedicated token variants.
#[test]
fn test_keywords() {
    let expected = vec![
        Token::Select,
        Token::From,
        Token::Where,
        Token::And,
        Token::Or,
        Token::Not,
        Token::Eof,
    ];
    assert_eq!(tokenize("SELECT FROM WHERE AND OR NOT"), expected);
}
1490
// Non-keyword words come back as identifiers with their original spelling.
#[test]
fn test_identifiers() {
    let expected = vec![
        Token::Ident("hosts".into()),
        Token::Ident("users".into()),
        Token::Ident("ip_address".into()),
        Token::Eof,
    ];
    assert_eq!(tokenize("hosts users ip_address"), expected);
}
1504
// Integers, decimals and exponent forms.
#[test]
fn test_numbers() {
    let expected = vec![
        Token::Integer(42),
        Token::Float(2.5),
        Token::Float(1e10),
        Token::Float(2.5e-3),
        Token::Eof,
    ];
    assert_eq!(tokenize("42 2.5 1e10 2.5e-3"), expected);
}
1519
1520 #[test]
1521 fn test_strings() {
1522 let tokens = tokenize("'hello' \"world\" 'it''s'");
1523 assert_eq!(
1524 tokens,
1525 vec![
1526 Token::String("hello".into()),
1527 Token::String("world".into()),
1528 Token::String("it's".into()),
1529 Token::Eof
1530 ]
1531 );
1532 }
1533
1534 #[test]
1535 fn test_operators() {
1536 let tokens = tokenize("= <> < <= > >= != + - * /");
1537 assert_eq!(
1538 tokens,
1539 vec![
1540 Token::Eq,
1541 Token::Ne,
1542 Token::Lt,
1543 Token::Le,
1544 Token::Gt,
1545 Token::Ge,
1546 Token::Ne,
1547 Token::Plus,
1548 Token::Dash,
1549 Token::Star,
1550 Token::Slash,
1551 Token::Eof
1552 ]
1553 );
1554 }
1555
1556 #[test]
1557 fn test_delimiters() {
1558 let tokens = tokenize("( ) [ ] { a } , . : ;");
1563 assert_eq!(
1564 tokens,
1565 vec![
1566 Token::LParen,
1567 Token::RParen,
1568 Token::LBracket,
1569 Token::RBracket,
1570 Token::LBrace,
1571 Token::Ident("a".into()),
1572 Token::RBrace,
1573 Token::Comma,
1574 Token::Dot,
1575 Token::Colon,
1576 Token::Semi,
1577 Token::Eof
1578 ]
1579 );
1580 }
1581
1582 #[test]
1583 fn test_json_literal_empty_object() {
1584 let tokens = tokenize("{ }");
1585 assert_eq!(tokens, vec![Token::JsonLiteral("{ }".into()), Token::Eof]);
1586 }
1587
1588 #[test]
1589 fn test_json_literal_simple() {
1590 let tokens = tokenize(r#"{"a":1}"#);
1591 assert_eq!(
1592 tokens,
1593 vec![Token::JsonLiteral(r#"{"a":1}"#.into()), Token::Eof]
1594 );
1595 }
1596
1597 #[test]
1598 fn test_json_literal_nested() {
1599 let raw = r#"{"a":{"b":[1,2,{"c":"}"}]}}"#;
1600 let tokens = tokenize(raw);
1601 assert_eq!(tokens, vec![Token::JsonLiteral(raw.into()), Token::Eof]);
1602 }
1603
1604 #[test]
1605 fn test_json_literal_escaped_quote_in_string() {
1606 let raw = r#"{"path":"O\"Brien}"}"#;
1608 let tokens = tokenize(raw);
1609 assert_eq!(tokens, vec![Token::JsonLiteral(raw.into()), Token::Eof]);
1610 }
1611
1612 #[test]
1613 fn test_json_literal_unbalanced_eof() {
1614 let mut lexer = Lexer::new(r#"{"a":1"#);
1615 let err = lexer.tokenize().expect_err("expected unterminated error");
1616 assert!(
1617 err.message.contains("unterminated JSON object literal"),
1618 "got: {}",
1619 err.message
1620 );
1621 }
1622
1623 #[test]
1624 fn test_json_literal_property_bag_compatible() {
1625 let tokens = tokenize("{name: 'value'}");
1628 assert_eq!(tokens[0], Token::LBrace);
1629 assert_eq!(*tokens.last().unwrap(), Token::Eof);
1630 }
1631
1632 #[test]
1633 fn test_graph_syntax() {
1634 let tokens = tokenize("-> <- - ..");
1635 assert_eq!(
1636 tokens,
1637 vec![
1638 Token::Arrow,
1639 Token::ArrowLeft,
1640 Token::Dash,
1641 Token::DotDot,
1642 Token::Eof
1643 ]
1644 );
1645 }
1646
1647 #[test]
1648 fn test_table_query() {
1649 let tokens = tokenize("SELECT ip, hostname FROM hosts WHERE os = 'Linux' LIMIT 10");
1650 assert_eq!(
1651 tokens,
1652 vec![
1653 Token::Select,
1654 Token::Ident("ip".into()),
1655 Token::Comma,
1656 Token::Ident("hostname".into()),
1657 Token::From,
1658 Token::Ident("hosts".into()),
1659 Token::Where,
1660 Token::Ident("os".into()),
1661 Token::Eq,
1662 Token::String("Linux".into()),
1663 Token::Limit,
1664 Token::Integer(10),
1665 Token::Eof
1666 ]
1667 );
1668 }
1669
1670 #[test]
1671 fn test_graph_query() {
1672 let tokens = tokenize("MATCH (h:Host)-[:HAS_SERVICE]->(s:Service) RETURN h, s");
1673 assert_eq!(
1674 tokens,
1675 vec![
1676 Token::Match,
1677 Token::LParen,
1678 Token::Ident("h".into()),
1679 Token::Colon,
1680 Token::Ident("Host".into()),
1681 Token::RParen,
1682 Token::Dash,
1683 Token::LBracket,
1684 Token::Colon,
1685 Token::Ident("HAS_SERVICE".into()),
1686 Token::RBracket,
1687 Token::Arrow,
1688 Token::LParen,
1689 Token::Ident("s".into()),
1690 Token::Colon,
1691 Token::Ident("Service".into()),
1692 Token::RParen,
1693 Token::Return,
1694 Token::Ident("h".into()),
1695 Token::Comma,
1696 Token::Ident("s".into()),
1697 Token::Eof
1698 ]
1699 );
1700 }
1701
1702 #[test]
1703 fn test_join_query() {
1704 let tokens = tokenize("FROM hosts h JOIN GRAPH (h)-[:HAS_VULN]->(v) ON h.ip = v.id");
1705 assert_eq!(
1706 tokens,
1707 vec![
1708 Token::From,
1709 Token::Ident("hosts".into()),
1710 Token::Ident("h".into()),
1711 Token::Join,
1712 Token::Graph,
1713 Token::LParen,
1714 Token::Ident("h".into()),
1715 Token::RParen,
1716 Token::Dash,
1717 Token::LBracket,
1718 Token::Colon,
1719 Token::Ident("HAS_VULN".into()),
1720 Token::RBracket,
1721 Token::Arrow,
1722 Token::LParen,
1723 Token::Ident("v".into()),
1724 Token::RParen,
1725 Token::On,
1726 Token::Ident("h".into()),
1727 Token::Dot,
1728 Token::Ident("ip".into()),
1729 Token::Eq,
1730 Token::Ident("v".into()),
1731 Token::Dot,
1732 Token::Ident("id".into()),
1733 Token::Eof
1734 ]
1735 );
1736 }
1737
1738 #[test]
1739 fn test_path_query() {
1740 let tokens = tokenize("PATH FROM host('192.168.1.1') TO host('10.0.0.1') VIA [:AUTH]");
1741 assert_eq!(
1742 tokens,
1743 vec![
1744 Token::Path,
1745 Token::From,
1746 Token::Ident("host".into()),
1747 Token::LParen,
1748 Token::String("192.168.1.1".into()),
1749 Token::RParen,
1750 Token::To,
1751 Token::Ident("host".into()),
1752 Token::LParen,
1753 Token::String("10.0.0.1".into()),
1754 Token::RParen,
1755 Token::Via,
1756 Token::LBracket,
1757 Token::Colon,
1758 Token::Ident("AUTH".into()),
1759 Token::RBracket,
1760 Token::Eof
1761 ]
1762 );
1763 }
1764
1765 #[test]
1766 fn test_variable_length_pattern() {
1767 let tokens = tokenize("(a)-[*1..5]->(b)");
1768 assert_eq!(
1769 tokens,
1770 vec![
1771 Token::LParen,
1772 Token::Ident("a".into()),
1773 Token::RParen,
1774 Token::Dash,
1775 Token::LBracket,
1776 Token::Star,
1777 Token::Integer(1),
1778 Token::DotDot,
1779 Token::Integer(5),
1780 Token::RBracket,
1781 Token::Arrow,
1782 Token::LParen,
1783 Token::Ident("b".into()),
1784 Token::RParen,
1785 Token::Eof
1786 ]
1787 );
1788 }
1789
1790 #[test]
1791 fn test_case_insensitive_keywords() {
1792 let tokens = tokenize("select FROM Where AND");
1793 assert_eq!(
1794 tokens,
1795 vec![
1796 Token::Select,
1797 Token::From,
1798 Token::Where,
1799 Token::And,
1800 Token::Eof
1801 ]
1802 );
1803 }
1804
1805 #[test]
1806 fn test_comments() {
1807 let tokens = tokenize("SELECT -- this is a comment\nip FROM hosts");
1808 assert_eq!(
1809 tokens,
1810 vec![
1811 Token::Select,
1812 Token::Ident("ip".into()),
1813 Token::From,
1814 Token::Ident("hosts".into()),
1815 Token::Eof
1816 ]
1817 );
1818 }
1819
1820 #[test]
1821 fn test_escaped_strings() {
1822 let tokens = tokenize(r"'hello\nworld' 'tab\there'");
1823 assert_eq!(
1824 tokens,
1825 vec![
1826 Token::String("hello\nworld".into()),
1827 Token::String("tab\there".into()),
1828 Token::Eof
1829 ]
1830 );
1831 }
1832
1833 #[test]
1834 fn test_keyword_matrix_and_alias_spellings() {
1835 let cases = [
1836 ("SELECT", Token::Select),
1837 ("FROM", Token::From),
1838 ("WHERE", Token::Where),
1839 ("AND", Token::And),
1840 ("OR", Token::Or),
1841 ("NOT", Token::Not),
1842 ("MATCH", Token::Match),
1843 ("RETURN", Token::Return),
1844 ("JOIN", Token::Join),
1845 ("GRAPH", Token::Graph),
1846 ("PATH", Token::Path),
1847 ("TO", Token::To),
1848 ("VIA", Token::Via),
1849 ("ON", Token::On),
1850 ("AS", Token::As),
1851 ("IS", Token::Is),
1852 ("NULL", Token::Null),
1853 ("BETWEEN", Token::Between),
1854 ("LIKE", Token::Like),
1855 ("IN", Token::In),
1856 ("ORDER", Token::Order),
1857 ("BY", Token::By),
1858 ("ASC", Token::Asc),
1859 ("DESC", Token::Desc),
1860 ("NULLS", Token::Nulls),
1861 ("FIRST", Token::First),
1862 ("LAST", Token::Last),
1863 ("LIMIT", Token::Limit),
1864 ("OFFSET", Token::Offset),
1865 ("INNER", Token::Inner),
1866 ("LEFT", Token::Left),
1867 ("RIGHT", Token::Right),
1868 ("OUTER", Token::Outer),
1869 ("FULL", Token::Full),
1870 ("CROSS", Token::Cross),
1871 ("STARTS", Token::Starts),
1872 ("ENDS", Token::Ends),
1873 ("WITH", Token::With),
1874 ("CONTAINS", Token::Contains),
1875 ("TRUE", Token::True),
1876 ("FALSE", Token::False),
1877 ("ENRICH", Token::Enrich),
1878 ("GROUP", Token::Group),
1879 ("COUNT", Token::Count),
1880 ("SUM", Token::Sum),
1881 ("AVG", Token::Avg),
1882 ("MIN", Token::Min),
1883 ("MAX", Token::Max),
1884 ("DISTINCT", Token::Distinct),
1885 ("VECTOR", Token::Vector),
1886 ("SEARCH", Token::Search),
1887 ("SIMILAR", Token::Similar),
1888 ("COLLECTION", Token::Collection),
1889 ("METRIC", Token::Metric),
1890 ("THRESHOLD", Token::Threshold),
1891 ("K", Token::K),
1892 ("HYBRID", Token::Hybrid),
1893 ("FUSION", Token::Fusion),
1894 ("RERANK", Token::Rerank),
1895 ("RRF", Token::Rrf),
1896 ("INTERSECTION", Token::Intersection),
1897 ("UNION", Token::Union),
1898 ("RECURSIVE", Token::Recursive),
1899 ("ALL", Token::All),
1900 ("WEIGHT", Token::Weight),
1901 ("L2", Token::L2),
1902 ("COSINE", Token::Cosine),
1903 ("INNER_PRODUCT", Token::InnerProduct),
1904 ("INNERPRODUCT", Token::InnerProduct),
1905 ("INCLUDE", Token::Include),
1906 ("METADATA", Token::Metadata),
1907 ("VECTORS", Token::Vectors),
1908 ("EXPLAIN", Token::Explain),
1909 ("FOR", Token::For),
1910 ("FORMAT", Token::Format),
1911 ("JSON", Token::Json),
1912 ("INSERT", Token::Insert),
1913 ("INTO", Token::Into),
1914 ("VALUES", Token::Values),
1915 ("UPDATE", Token::Update),
1916 ("SET", Token::Set),
1917 ("DELETE", Token::Delete),
1918 ("TRUNCATE", Token::Truncate),
1919 ("CREATE", Token::Create),
1920 ("TABLE", Token::Table),
1921 ("DROP", Token::Drop),
1922 ("ALTER", Token::Alter),
1923 ("ADD", Token::Add),
1924 ("COLUMN", Token::Column),
1925 ("PRIMARY", Token::Primary),
1926 ("KEY", Token::Key),
1927 ("DEFAULT", Token::Default),
1928 ("COMPRESS", Token::Compress),
1929 ("INDEX", Token::Index),
1930 ("UNIQUE", Token::Unique),
1931 ("IF", Token::If),
1932 ("EXISTS", Token::Exists),
1933 ("RETURNING", Token::Returning),
1934 ("CASCADE", Token::Cascade),
1935 ("RENAME", Token::Rename),
1936 ("USING", Token::Using),
1937 ("NODE", Token::Node),
1938 ("EDGE", Token::Edge),
1939 ("DOCUMENT", Token::Document),
1940 ("KV", Token::Kv),
1941 ("TIMESERIES", Token::Timeseries),
1942 ("RETENTION", Token::Retention),
1943 ("QUEUE", Token::Queue),
1944 ("TREE", Token::Tree),
1945 ("PUSH", Token::Push),
1946 ("POP", Token::Pop),
1947 ("PEEK", Token::Peek),
1948 ("PURGE", Token::Purge),
1949 ("ACK", Token::Ack),
1950 ("NACK", Token::Nack),
1951 ("PRIORITY", Token::Priority),
1952 ("LPUSH", Token::Ident("LPUSH".into())),
1953 ("RPUSH", Token::Ident("RPUSH".into())),
1954 ("LPOP", Token::Ident("LPOP".into())),
1955 ("RPOP", Token::Ident("RPOP".into())),
1956 ("NEIGHBORHOOD", Token::Neighborhood),
1957 ("SHORTEST_PATH", Token::ShortestPath),
1958 ("SHORTESTPATH", Token::ShortestPath),
1959 ("CENTRALITY", Token::Centrality),
1960 ("COMMUNITY", Token::Community),
1961 ("COMPONENTS", Token::Components),
1962 ("CYCLES", Token::Cycles),
1963 ("TRAVERSE", Token::Traverse),
1964 ("DEPTH", Token::Depth),
1965 ("DIRECTION", Token::Direction),
1966 ("ALGORITHM", Token::Algorithm),
1967 ("STRATEGY", Token::Strategy),
1968 ("MAX_ITERATIONS", Token::MaxIterations),
1969 ("MAXITERATIONS", Token::MaxIterations),
1970 ("MAX_LENGTH", Token::MaxLength),
1971 ("MAXLENGTH", Token::MaxLength),
1972 ("MODE", Token::Mode),
1973 ("CLUSTERING", Token::Clustering),
1974 ("TOPOLOGICAL_SORT", Token::TopologicalSort),
1975 ("TOPOLOGICALSORT", Token::TopologicalSort),
1976 ("PROPERTIES", Token::Properties),
1977 ("TEXT", Token::Text),
1978 ("FUZZY", Token::Fuzzy),
1979 ("MIN_SCORE", Token::MinScore),
1980 ("MINSCORE", Token::MinScore),
1981 ("BEGIN", Token::Begin),
1982 ("COMMIT", Token::Commit),
1983 ("ROLLBACK", Token::Rollback),
1984 ("SAVEPOINT", Token::Savepoint),
1985 ("RELEASE", Token::Release),
1986 ("START", Token::Start),
1987 ("TRANSACTION", Token::Transaction),
1988 ("WORK", Token::Work),
1989 ("VACUUM", Token::Vacuum),
1990 ("ANALYZE", Token::Analyze),
1991 ("SCHEMA", Token::Schema),
1992 ("SEQUENCE", Token::Sequence),
1993 ("INCREMENT", Token::Increment),
1994 ("COPY", Token::Copy),
1995 ("HEADER", Token::Header),
1996 ("DELIMITER", Token::Delimiter),
1997 ("VIEW", Token::View),
1998 ("MATERIALIZED", Token::Materialized),
1999 ("REFRESH", Token::Refresh),
2000 ("PARTITION", Token::Partition),
2001 ("RANGE", Token::Range),
2002 ("LIST", Token::List),
2003 ("HASH", Token::Hash),
2004 ("ATTACH", Token::Attach),
2005 ("DETACH", Token::Detach),
2006 ("OF", Token::Of),
2007 ("POLICY", Token::Policy),
2008 ("ENABLE", Token::Enable),
2009 ("DISABLE", Token::Disable),
2010 ("SECURITY", Token::Security),
2011 ("ROW", Token::Row),
2012 ("LEVEL", Token::Level),
2013 ("FOREIGN", Token::Foreign),
2014 ("SERVER", Token::Server),
2015 ("WRAPPER", Token::Wrapper),
2016 ("OPTIONS", Token::Options),
2017 ("DATA", Token::Data),
2018 ("plain_ident", Token::Ident("plain_ident".into())),
2019 ];
2020
2021 for (input, expected) in cases {
2022 let tokens = tokenize(input);
2023 assert_eq!(tokens, vec![expected, Token::Eof], "{input}");
2024 }
2025 }
2026
2027 #[test]
2028 fn test_display_all_token_variants() {
2029 let cases = [
2030 (Token::Select, "SELECT"),
2031 (Token::From, "FROM"),
2032 (Token::Where, "WHERE"),
2033 (Token::And, "AND"),
2034 (Token::Or, "OR"),
2035 (Token::Not, "NOT"),
2036 (Token::Match, "MATCH"),
2037 (Token::Return, "RETURN"),
2038 (Token::Join, "JOIN"),
2039 (Token::Graph, "GRAPH"),
2040 (Token::Path, "PATH"),
2041 (Token::To, "TO"),
2042 (Token::Via, "VIA"),
2043 (Token::On, "ON"),
2044 (Token::As, "AS"),
2045 (Token::Is, "IS"),
2046 (Token::Null, "NULL"),
2047 (Token::Between, "BETWEEN"),
2048 (Token::Like, "LIKE"),
2049 (Token::In, "IN"),
2050 (Token::Order, "ORDER"),
2051 (Token::By, "BY"),
2052 (Token::Asc, "ASC"),
2053 (Token::Desc, "DESC"),
2054 (Token::Nulls, "NULLS"),
2055 (Token::First, "FIRST"),
2056 (Token::Last, "LAST"),
2057 (Token::Limit, "LIMIT"),
2058 (Token::Offset, "OFFSET"),
2059 (Token::Inner, "INNER"),
2060 (Token::Left, "LEFT"),
2061 (Token::Right, "RIGHT"),
2062 (Token::Outer, "OUTER"),
2063 (Token::Full, "FULL"),
2064 (Token::Cross, "CROSS"),
2065 (Token::Starts, "STARTS"),
2066 (Token::Ends, "ENDS"),
2067 (Token::With, "WITH"),
2068 (Token::Contains, "CONTAINS"),
2069 (Token::True, "TRUE"),
2070 (Token::False, "FALSE"),
2071 (Token::Enrich, "ENRICH"),
2072 (Token::Group, "GROUP"),
2073 (Token::Count, "COUNT"),
2074 (Token::Sum, "SUM"),
2075 (Token::Avg, "AVG"),
2076 (Token::Min, "MIN"),
2077 (Token::Max, "MAX"),
2078 (Token::Distinct, "DISTINCT"),
2079 (Token::Vector, "VECTOR"),
2080 (Token::Search, "SEARCH"),
2081 (Token::Similar, "SIMILAR"),
2082 (Token::Collection, "COLLECTION"),
2083 (Token::Metric, "METRIC"),
2084 (Token::Threshold, "THRESHOLD"),
2085 (Token::K, "K"),
2086 (Token::Hybrid, "HYBRID"),
2087 (Token::Fusion, "FUSION"),
2088 (Token::Rerank, "RERANK"),
2089 (Token::Rrf, "RRF"),
2090 (Token::Intersection, "INTERSECTION"),
2091 (Token::Union, "UNION"),
2092 (Token::Recursive, "RECURSIVE"),
2093 (Token::All, "ALL"),
2094 (Token::Weight, "WEIGHT"),
2095 (Token::L2, "L2"),
2096 (Token::Cosine, "COSINE"),
2097 (Token::InnerProduct, "INNER_PRODUCT"),
2098 (Token::Include, "INCLUDE"),
2099 (Token::Metadata, "METADATA"),
2100 (Token::Vectors, "VECTORS"),
2101 (Token::Explain, "EXPLAIN"),
2102 (Token::For, "FOR"),
2103 (Token::Format, "FORMAT"),
2104 (Token::Json, "JSON"),
2105 (Token::Insert, "INSERT"),
2106 (Token::Into, "INTO"),
2107 (Token::Values, "VALUES"),
2108 (Token::Update, "UPDATE"),
2109 (Token::Set, "SET"),
2110 (Token::Delete, "DELETE"),
2111 (Token::Truncate, "TRUNCATE"),
2112 (Token::Create, "CREATE"),
2113 (Token::Table, "TABLE"),
2114 (Token::Drop, "DROP"),
2115 (Token::Alter, "ALTER"),
2116 (Token::Add, "ADD"),
2117 (Token::Column, "COLUMN"),
2118 (Token::Primary, "PRIMARY"),
2119 (Token::Key, "KEY"),
2120 (Token::Default, "DEFAULT"),
2121 (Token::Compress, "COMPRESS"),
2122 (Token::Index, "INDEX"),
2123 (Token::Unique, "UNIQUE"),
2124 (Token::If, "IF"),
2125 (Token::Exists, "EXISTS"),
2126 (Token::Returning, "RETURNING"),
2127 (Token::Cascade, "CASCADE"),
2128 (Token::Rename, "RENAME"),
2129 (Token::Using, "USING"),
2130 (Token::Node, "NODE"),
2131 (Token::Edge, "EDGE"),
2132 (Token::Document, "DOCUMENT"),
2133 (Token::Kv, "KV"),
2134 (Token::Timeseries, "TIMESERIES"),
2135 (Token::Retention, "RETENTION"),
2136 (Token::Queue, "QUEUE"),
2137 (Token::Tree, "TREE"),
2138 (Token::Push, "PUSH"),
2139 (Token::Pop, "POP"),
2140 (Token::Peek, "PEEK"),
2141 (Token::Purge, "PURGE"),
2142 (Token::Ack, "ACK"),
2143 (Token::Nack, "NACK"),
2144 (Token::Priority, "PRIORITY"),
2145 (Token::Neighborhood, "NEIGHBORHOOD"),
2146 (Token::ShortestPath, "SHORTEST_PATH"),
2147 (Token::Centrality, "CENTRALITY"),
2148 (Token::Community, "COMMUNITY"),
2149 (Token::Components, "COMPONENTS"),
2150 (Token::Cycles, "CYCLES"),
2151 (Token::Traverse, "TRAVERSE"),
2152 (Token::Depth, "DEPTH"),
2153 (Token::Direction, "DIRECTION"),
2154 (Token::Algorithm, "ALGORITHM"),
2155 (Token::Strategy, "STRATEGY"),
2156 (Token::MaxIterations, "MAX_ITERATIONS"),
2157 (Token::MaxLength, "MAX_LENGTH"),
2158 (Token::Mode, "MODE"),
2159 (Token::Clustering, "CLUSTERING"),
2160 (Token::TopologicalSort, "TOPOLOGICAL_SORT"),
2161 (Token::Properties, "PROPERTIES"),
2162 (Token::Text, "TEXT"),
2163 (Token::Fuzzy, "FUZZY"),
2164 (Token::MinScore, "MIN_SCORE"),
2165 (Token::Begin, "BEGIN"),
2166 (Token::Commit, "COMMIT"),
2167 (Token::Rollback, "ROLLBACK"),
2168 (Token::Savepoint, "SAVEPOINT"),
2169 (Token::Release, "RELEASE"),
2170 (Token::Start, "START"),
2171 (Token::Transaction, "TRANSACTION"),
2172 (Token::Work, "WORK"),
2173 (Token::Vacuum, "VACUUM"),
2174 (Token::Analyze, "ANALYZE"),
2175 (Token::Schema, "SCHEMA"),
2176 (Token::Sequence, "SEQUENCE"),
2177 (Token::Increment, "INCREMENT"),
2178 (Token::Copy, "COPY"),
2179 (Token::Header, "HEADER"),
2180 (Token::Delimiter, "DELIMITER"),
2181 (Token::View, "VIEW"),
2182 (Token::Materialized, "MATERIALIZED"),
2183 (Token::Refresh, "REFRESH"),
2184 (Token::Partition, "PARTITION"),
2185 (Token::Range, "RANGE"),
2186 (Token::List, "LIST"),
2187 (Token::Hash, "HASH"),
2188 (Token::Attach, "ATTACH"),
2189 (Token::Detach, "DETACH"),
2190 (Token::Of, "OF"),
2191 (Token::Policy, "POLICY"),
2192 (Token::Enable, "ENABLE"),
2193 (Token::Disable, "DISABLE"),
2194 (Token::Security, "SECURITY"),
2195 (Token::Row, "ROW"),
2196 (Token::Level, "LEVEL"),
2197 (Token::Foreign, "FOREIGN"),
2198 (Token::Server, "SERVER"),
2199 (Token::Wrapper, "WRAPPER"),
2200 (Token::Options, "OPTIONS"),
2201 (Token::Data, "DATA"),
2202 (Token::String("x".into()), "'x'"),
2203 (Token::Integer(7), "7"),
2204 (Token::Float(1.5), "1.5"),
2205 (Token::JsonLiteral(r#"{"x":1}"#.into()), r#"{"x":1}"#),
2206 (Token::Ident("id".into()), "id"),
2207 (Token::Eq, "="),
2208 (Token::Ne, "<>"),
2209 (Token::Lt, "<"),
2210 (Token::Le, "<="),
2211 (Token::Gt, ">"),
2212 (Token::Ge, ">="),
2213 (Token::Plus, "+"),
2214 (Token::Minus, "-"),
2215 (Token::Star, "*"),
2216 (Token::Slash, "/"),
2217 (Token::Percent, "%"),
2218 (Token::LParen, "("),
2219 (Token::RParen, ")"),
2220 (Token::LBracket, "["),
2221 (Token::RBracket, "]"),
2222 (Token::LBrace, "{"),
2223 (Token::RBrace, "}"),
2224 (Token::Comma, ","),
2225 (Token::Dot, "."),
2226 (Token::Colon, ":"),
2227 (Token::Semi, ";"),
2228 (Token::Dollar, "$"),
2229 (Token::Arrow, "->"),
2230 (Token::ArrowLeft, "<-"),
2231 (Token::Dash, "-"),
2232 (Token::DotDot, ".."),
2233 (Token::Pipe, "|"),
2234 (Token::DoublePipe, "||"),
2235 (Token::Eof, "EOF"),
2236 ];
2237
2238 for (token, expected) in cases {
2239 assert_eq!(token.to_string(), expected);
2240 }
2241 }
2242
2243 #[test]
2244 fn test_string_escape_and_error_matrix() {
2245 let tokens = tokenize(
2246 r#"'line\nrow' 'carriage\rreturn' 'tab\tstop' 'slash\\' 'quote\'' "dq\"" 'raw\z'"#,
2247 );
2248 assert_eq!(
2249 tokens,
2250 vec![
2251 Token::String("line\nrow".into()),
2252 Token::String("carriage\rreturn".into()),
2253 Token::String("tab\tstop".into()),
2254 Token::String("slash\\".into()),
2255 Token::String("quote'".into()),
2256 Token::String("dq\"".into()),
2257 Token::String(r"raw\z".into()),
2258 Token::Eof
2259 ]
2260 );
2261
2262 let mut lexer = Lexer::new("'unterminated");
2263 assert!(lexer
2264 .next_token()
2265 .unwrap_err()
2266 .message
2267 .contains("Unterminated string"));
2268
2269 let mut lexer = Lexer::new(r"'bad\");
2270 assert!(lexer
2271 .next_token()
2272 .unwrap_err()
2273 .message
2274 .contains("Unterminated string"));
2275 }
2276
2277 #[test]
2278 fn test_operator_comment_peek_limit_and_tokenize_paths() {
2279 let tokens = tokenize("!= % ; $ || | 123.abc 1..2 1e+2 <- -> /* block */ SELECT");
2280 assert_eq!(
2281 tokens,
2282 vec![
2283 Token::Ne,
2284 Token::Percent,
2285 Token::Semi,
2286 Token::Dollar,
2287 Token::DoublePipe,
2288 Token::Pipe,
2289 Token::Integer(123),
2290 Token::Dot,
2291 Token::Ident("abc".into()),
2292 Token::Integer(1),
2293 Token::DotDot,
2294 Token::Integer(2),
2295 Token::Float(1e2),
2296 Token::ArrowLeft,
2297 Token::Arrow,
2298 Token::Select,
2299 Token::Eof,
2300 ]
2301 );
2302
2303 let mut lexer = Lexer::new("SELECT FROM");
2304 assert_eq!(lexer.peek_token().unwrap().token, Token::Select);
2305 assert_eq!(lexer.next_token().unwrap().token, Token::Select);
2306 assert_eq!(lexer.next_token().unwrap().token, Token::From);
2307
2308 let mut lexer = Lexer::new("!");
2309 assert!(lexer
2310 .next_token()
2311 .unwrap_err()
2312 .message
2313 .contains("Expected '=' after '!'"));
2314
2315 let limits = crate::storage::query::parser::ParserLimits {
2316 max_identifier_chars: 3,
2317 ..crate::storage::query::parser::ParserLimits::default()
2318 };
2319 let mut lexer = Lexer::with_limits("abcd", limits);
2320 assert_eq!(lexer.max_identifier_chars(), 3);
2321 let err = lexer.next_token().unwrap_err();
2322 assert!(matches!(
2323 err.limit_hit,
2324 Some(LexerLimitHit::IdentifierTooLong { value: 3, .. })
2325 ));
2326 }
2327}