Skip to main content

reddb_rql/
lexer.rs

//! RQL Lexer
//!
//! Tokenizes RQL (RedDB Query Language) strings for parsing.
//! Supports both SQL-like table queries and Cypher-like graph patterns.
//!
//! # Token Types
//!
//! - Keywords: SELECT, FROM, WHERE, MATCH, RETURN, JOIN, GRAPH, PATH, etc.
//! - Literals: strings, integers, floats, booleans, raw JSON object literals
//! - Identifiers: table names, column names, aliases
//! - Operators: comparison, arithmetic, logical
//! - Graph syntax: arrows (->), edge brackets ([-])
13
14use std::fmt;
15use std::iter::Peekable;
16use std::str::Chars;
17
/// Token types for RQL.
///
/// Keyword and punctuation variants carry no payload; literal and identifier
/// variants own their value. `Eq` is not derived because `Float` holds an
/// `f64`. The `Display` impl that follows must list every variant, so a new
/// variant added here needs a matching spelling there.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    And,
    Or,
    Not,
    Match,
    Return,
    Join,
    Graph,
    Path,
    To,
    Via,
    On,
    As,
    Is,
    Null,
    Between,
    Like,
    In,
    Order,
    By,
    Asc,
    Desc,
    Nulls,
    First,
    Last,
    Limit,
    Offset,
    Inner,
    Left,
    Right,
    Outer,
    Full,
    Cross,
    Starts,
    Ends,
    With,
    Contains,
    True,
    False,
    Enrich,
    Group,
    Count,
    Sum,
    Avg,
    Min,
    Max,
    Distinct,

    // Vector query keywords
    Vector,
    Search,
    Similar,
    Collection,
    Metric,
    Threshold,
    K,
    Hybrid,
    Fusion,
    Rerank,
    Rrf,
    Intersection,
    Union,
    Recursive,
    All,
    Weight,
    L2,
    Cosine,
    InnerProduct,
    Include,
    Metadata,
    Vectors,

    // DML/DDL keywords
    Insert,
    Into,
    Values,
    Update,
    Set,
    Delete,
    Truncate,
    Create,
    Table,
    Drop,
    Alter,
    Add,
    Column,
    Primary,
    // EXPLAIN ALTER FOR — schema diff command
    Explain,
    For,
    Format,
    Json,
    Key,
    Default,
    Compress,
    Index,
    Unique,
    If,
    Exists,
    Returning,
    Cascade,
    Rename,
    Using,

    // Entity type keywords
    Node,
    Edge,
    Document,
    Kv,

    // Time-series & Queue keywords
    Timeseries,
    Retention,
    Queue,
    Tree,
    Push,
    Pop,
    Peek,
    Purge,
    Ack,
    Nack,
    Priority,

    // Graph command keywords
    Neighborhood,
    ShortestPath,
    Centrality,
    Community,
    Components,
    Cycles,
    Traverse,
    Depth,
    Direction,
    Algorithm,
    Strategy,
    MaxIterations,
    MaxLength,
    Mode,
    Clustering,
    TopologicalSort,
    Properties,
    Text,
    Fuzzy,
    MinScore,

    // Transaction control keywords (Phase 1.1 PG parity)
    Begin,
    Commit,
    Rollback,
    Savepoint,
    Release,
    Start,
    Transaction,
    Work,

    // Maintenance keywords (Phase 1.2 PG parity)
    Vacuum,
    Analyze,

    // Schema + sequence keywords (Phase 1.3 PG parity)
    Schema,
    Sequence,
    Increment,

    // COPY command keywords (Phase 1.5 PG parity)
    Copy,
    Header,
    Delimiter,

    // View keywords (Phase 2.1 PG parity)
    View,
    Materialized,
    Refresh,

    // Partitioning keywords (Phase 2.2 PG parity)
    Partition,
    Range,
    List,
    Hash,
    Attach,
    Detach,
    Of,

    // Row Level Security keywords (Phase 2.5 PG parity)
    Policy,
    Enable,
    Disable,
    Security,
    Row,
    Level,

    // Foreign Data Wrapper keywords (Phase 3.2 PG parity)
    Foreign,
    Server,
    Wrapper,
    Options,
    Data,

    // SESSIONIZE operator (issue #585 slice 8).
    Sessionize,
    Gap,

    // Window OVER-clause keywords (issue #589 slice 7a).
    // Note: PARTITION, RANGE, BETWEEN, ROW already exist for DDL/predicate
    // contexts; reused here in window context.
    Over,
    Rows,
    Preceding,
    Following,
    Unbounded,
    Current,

    // Literals
    String(String),
    Integer(i64),
    Float(f64),
    /// Raw JSON object literal text — produced when the lexer enters JSON
    /// sub-mode at a `{` whose first non-whitespace inner char is `"`,
    /// signalling a standard JSON object. The String holds the verbatim
    /// `{...}` text, including the enclosing braces. The parser hands it
    /// to `parse_json` to materialise a `Value::Json`. See issue #86.
    JsonLiteral(String),

    // Identifiers
    Ident(String),

    // Operators
    Eq,      // =
    Ne,      // <> or !=
    Lt,      // <
    Le,      // <=
    Gt,      // >
    Ge,      // >=
    Plus,    // +
    Minus,   // -
    Star,    // *
    Slash,   // /
    Percent, // %

    // Delimiters
    LParen,   // (
    RParen,   // )
    LBracket, // [
    RBracket, // ]
    LBrace,   // {
    RBrace,   // }
    Comma,    // ,
    Dot,      // .
    Colon,    // :
    Semi,     // ;
    Dollar,   // $
    Question, // ?

    // Named-argument syntax (e.g. `louvain(g, resolution => 0.5)`)
    FatArrow, // =>

    // Graph syntax
    Arrow,      // ->
    ArrowLeft,  // <-
    Dash,       // -
    DotDot,     // ..
    Pipe,       // |
    DoublePipe, // ||

    // End of input
    Eof,
}

/// Renders a token for diagnostics.
///
/// Keywords print in upper case (multi-word keywords joined with `_`),
/// punctuation prints as its source text, and `Ne` always prints as `<>`.
/// String literals are wrapped in single quotes without re-escaping, so the
/// output is meant for messages rather than for round-tripping through the
/// lexer. `Minus` and `Dash` both print as `-`.
impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Payload-carrying variants return early; every other variant maps to
        // a fixed spelling that is written once at the bottom.
        let text: &str = match self {
            Token::String(s) => return write!(f, "'{}'", s),
            Token::Integer(n) => return write!(f, "{}", n),
            Token::Float(n) => return write!(f, "{}", n),
            Token::JsonLiteral(s) => return f.write_str(s),
            Token::Ident(s) => return f.write_str(s),

            // Keywords
            Token::Select => "SELECT",
            Token::From => "FROM",
            Token::Where => "WHERE",
            Token::And => "AND",
            Token::Or => "OR",
            Token::Not => "NOT",
            Token::Match => "MATCH",
            Token::Return => "RETURN",
            Token::Join => "JOIN",
            Token::Graph => "GRAPH",
            Token::Path => "PATH",
            Token::To => "TO",
            Token::Via => "VIA",
            Token::On => "ON",
            Token::As => "AS",
            Token::Is => "IS",
            Token::Null => "NULL",
            Token::Between => "BETWEEN",
            Token::Like => "LIKE",
            Token::In => "IN",
            Token::Order => "ORDER",
            Token::By => "BY",
            Token::Asc => "ASC",
            Token::Desc => "DESC",
            Token::Nulls => "NULLS",
            Token::First => "FIRST",
            Token::Last => "LAST",
            Token::Limit => "LIMIT",
            Token::Offset => "OFFSET",
            Token::Inner => "INNER",
            Token::Left => "LEFT",
            Token::Right => "RIGHT",
            Token::Outer => "OUTER",
            Token::Full => "FULL",
            Token::Cross => "CROSS",
            Token::Starts => "STARTS",
            Token::Ends => "ENDS",
            Token::With => "WITH",
            Token::Contains => "CONTAINS",
            Token::True => "TRUE",
            Token::False => "FALSE",
            Token::Enrich => "ENRICH",
            Token::Group => "GROUP",
            Token::Count => "COUNT",
            Token::Sum => "SUM",
            Token::Avg => "AVG",
            Token::Min => "MIN",
            Token::Max => "MAX",
            Token::Distinct => "DISTINCT",

            // Vector query keywords
            Token::Vector => "VECTOR",
            Token::Search => "SEARCH",
            Token::Similar => "SIMILAR",
            Token::Collection => "COLLECTION",
            Token::Metric => "METRIC",
            Token::Threshold => "THRESHOLD",
            Token::K => "K",
            Token::Hybrid => "HYBRID",
            Token::Fusion => "FUSION",
            Token::Rerank => "RERANK",
            Token::Rrf => "RRF",
            Token::Intersection => "INTERSECTION",
            Token::Union => "UNION",
            Token::Recursive => "RECURSIVE",
            Token::All => "ALL",
            Token::Weight => "WEIGHT",
            Token::L2 => "L2",
            Token::Cosine => "COSINE",
            Token::InnerProduct => "INNER_PRODUCT",
            Token::Include => "INCLUDE",
            Token::Metadata => "METADATA",
            Token::Vectors => "VECTORS",

            // DML/DDL keywords
            Token::Insert => "INSERT",
            Token::Into => "INTO",
            Token::Values => "VALUES",
            Token::Update => "UPDATE",
            Token::Set => "SET",
            Token::Delete => "DELETE",
            Token::Truncate => "TRUNCATE",
            Token::Create => "CREATE",
            Token::Table => "TABLE",
            Token::Drop => "DROP",
            Token::Alter => "ALTER",
            Token::Add => "ADD",
            Token::Column => "COLUMN",
            Token::Primary => "PRIMARY",
            Token::Explain => "EXPLAIN",
            Token::For => "FOR",
            Token::Format => "FORMAT",
            Token::Json => "JSON",
            Token::Key => "KEY",
            Token::Default => "DEFAULT",
            Token::Compress => "COMPRESS",
            Token::Index => "INDEX",
            Token::Unique => "UNIQUE",
            Token::If => "IF",
            Token::Exists => "EXISTS",
            Token::Returning => "RETURNING",
            Token::Cascade => "CASCADE",
            Token::Rename => "RENAME",
            Token::Using => "USING",

            // Entity type keywords
            Token::Node => "NODE",
            Token::Edge => "EDGE",
            Token::Document => "DOCUMENT",
            Token::Kv => "KV",

            // Time-series & Queue keywords
            Token::Timeseries => "TIMESERIES",
            Token::Retention => "RETENTION",
            Token::Queue => "QUEUE",
            Token::Tree => "TREE",
            Token::Push => "PUSH",
            Token::Pop => "POP",
            Token::Peek => "PEEK",
            Token::Purge => "PURGE",
            Token::Ack => "ACK",
            Token::Nack => "NACK",
            Token::Priority => "PRIORITY",

            // Graph command keywords
            Token::Neighborhood => "NEIGHBORHOOD",
            Token::ShortestPath => "SHORTEST_PATH",
            Token::Centrality => "CENTRALITY",
            Token::Community => "COMMUNITY",
            Token::Components => "COMPONENTS",
            Token::Cycles => "CYCLES",
            Token::Traverse => "TRAVERSE",
            Token::Depth => "DEPTH",
            Token::Direction => "DIRECTION",
            Token::Algorithm => "ALGORITHM",
            Token::Strategy => "STRATEGY",
            Token::MaxIterations => "MAX_ITERATIONS",
            Token::MaxLength => "MAX_LENGTH",
            Token::Mode => "MODE",
            Token::Clustering => "CLUSTERING",
            Token::TopologicalSort => "TOPOLOGICAL_SORT",
            Token::Properties => "PROPERTIES",
            Token::Text => "TEXT",
            Token::Fuzzy => "FUZZY",
            Token::MinScore => "MIN_SCORE",

            // Transaction control keywords
            Token::Begin => "BEGIN",
            Token::Commit => "COMMIT",
            Token::Rollback => "ROLLBACK",
            Token::Savepoint => "SAVEPOINT",
            Token::Release => "RELEASE",
            Token::Start => "START",
            Token::Transaction => "TRANSACTION",
            Token::Work => "WORK",

            // Maintenance keywords
            Token::Vacuum => "VACUUM",
            Token::Analyze => "ANALYZE",

            // Schema + sequence keywords
            Token::Schema => "SCHEMA",
            Token::Sequence => "SEQUENCE",
            Token::Increment => "INCREMENT",

            // COPY command keywords
            Token::Copy => "COPY",
            Token::Header => "HEADER",
            Token::Delimiter => "DELIMITER",

            // View keywords
            Token::View => "VIEW",
            Token::Materialized => "MATERIALIZED",
            Token::Refresh => "REFRESH",

            // Partitioning keywords
            Token::Partition => "PARTITION",
            Token::Range => "RANGE",
            Token::List => "LIST",
            Token::Hash => "HASH",
            Token::Attach => "ATTACH",
            Token::Detach => "DETACH",
            Token::Of => "OF",

            // Row Level Security keywords
            Token::Policy => "POLICY",
            Token::Enable => "ENABLE",
            Token::Disable => "DISABLE",
            Token::Security => "SECURITY",
            Token::Row => "ROW",
            Token::Level => "LEVEL",

            // Foreign Data Wrapper keywords
            Token::Foreign => "FOREIGN",
            Token::Server => "SERVER",
            Token::Wrapper => "WRAPPER",
            Token::Options => "OPTIONS",
            Token::Data => "DATA",

            // SESSIONIZE operator
            Token::Sessionize => "SESSIONIZE",
            Token::Gap => "GAP",

            // Window OVER-clause keywords
            Token::Over => "OVER",
            Token::Rows => "ROWS",
            Token::Preceding => "PRECEDING",
            Token::Following => "FOLLOWING",
            Token::Unbounded => "UNBOUNDED",
            Token::Current => "CURRENT",

            // Operators
            Token::Eq => "=",
            Token::Ne => "<>",
            Token::Lt => "<",
            Token::Le => "<=",
            Token::Gt => ">",
            Token::Ge => ">=",
            Token::Plus => "+",
            Token::Minus => "-",
            Token::Star => "*",
            Token::Slash => "/",
            Token::Percent => "%",

            // Delimiters
            Token::LParen => "(",
            Token::RParen => ")",
            Token::LBracket => "[",
            Token::RBracket => "]",
            Token::LBrace => "{",
            Token::RBrace => "}",
            Token::Comma => ",",
            Token::Dot => ".",
            Token::Colon => ":",
            Token::Semi => ";",
            Token::Dollar => "$",
            Token::Question => "?",

            // Named-argument syntax
            Token::FatArrow => "=>",

            // Graph syntax
            Token::Arrow => "->",
            Token::ArrowLeft => "<-",
            Token::Dash => "-",
            Token::DotDot => "..",
            Token::Pipe => "|",
            Token::DoublePipe => "||",

            // End of input
            Token::Eof => "EOF",
        };
        f.write_str(text)
    }
}
514
/// Position in source code.
///
/// `line` and `column` follow a 1-indexed convention. The derived `Default`
/// yields `0:0` at offset 0, which lies outside that convention and is best
/// read as "no position".
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct Position {
    /// Line number (1-indexed)
    pub line: u32,
    /// Column number (1-indexed)
    pub column: u32,
    /// Byte offset from start
    pub offset: u32,
}

impl Position {
    /// Create a new position.
    ///
    /// `const` so fixed positions can be built in constant contexts; callers
    /// that use it at runtime are unaffected.
    pub const fn new(line: u32, column: u32, offset: u32) -> Self {
        Self {
            line,
            column,
            offset,
        }
    }
}

/// Formats as `line:column`; the byte offset is not shown.
impl fmt::Display for Position {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}:{}", self.line, self.column)
    }
}
542
543/// A token with its position in source
544#[derive(Debug, Clone)]
545pub struct Spanned {
546    /// The token
547    pub token: Token,
548    /// Start position
549    pub start: Position,
550    /// End position
551    pub end: Position,
552}
553
554impl Spanned {
555    /// Create a new spanned token
556    pub fn new(token: Token, start: Position, end: Position) -> Self {
557        Self { token, start, end }
558    }
559}
560
561/// Lexer error
562#[derive(Debug, Clone)]
563pub struct LexerError {
564    /// Error message
565    pub message: String,
566    /// Position where error occurred
567    pub position: Position,
568    /// Optional structured DoS-limit annotation. When set, the
569    /// `From<LexerError> for ParseError` conversion preserves this
570    /// kind so callers can pattern-match on the limit programmatically.
571    pub limit_hit: Option<LexerLimitHit>,
572}
573
/// A specific DoS limit that the lexer refused to cross.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerLimitHit {
    /// Identifier length cap.
    IdentifierTooLong {
        /// Presumably the name of the configured limit that was exceeded —
        /// confirm against the code that builds this variant.
        limit_name: &'static str,
        /// Presumably the limit value or the offending length — confirm
        /// against the code that builds this variant.
        value: usize,
    },
}
583
584impl LexerError {
585    /// Create a new lexer error
586    pub fn new(message: impl Into<String>, position: Position) -> Self {
587        Self {
588            message: message.into(),
589            position,
590            limit_hit: None,
591        }
592    }
593
594    /// Create a lexer error tagged with a structured limit-hit kind.
595    pub(crate) fn with_limit(
596        message: impl Into<String>,
597        position: Position,
598        limit_hit: LexerLimitHit,
599    ) -> Self {
600        Self {
601            message: message.into(),
602            position,
603            limit_hit: Some(limit_hit),
604        }
605    }
606}
607
608impl fmt::Display for LexerError {
609    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
610        write!(f, "Lexer error at {}: {}", self.position, self.message)
611    }
612}
613
614impl std::error::Error for LexerError {}
615
/// Maximum byte size of a raw JSON object literal. Mirrors the redwire
/// frame ceiling (`MAX_FRAME_SIZE` = 16 MiB) so a single SQL statement
/// can never embed a JSON literal larger than the wire payload limit.
/// Wire-side limit lives in `reddb_wire::redwire::MAX_FRAME_SIZE`;
/// duplicated here as a parser-side guard so the lexer can fail fast
/// without depending on the wire crate.
///
/// The two constants are not linked by the compiler, so a change to the
/// wire-side value must be mirrored here by hand.
pub const JSON_LITERAL_MAX_BYTES: usize = 16 * 1024 * 1024;
623
624/// RQL Lexer
625pub struct Lexer<'a> {
626    /// Original input text — kept so the JSON sub-mode can slice raw
627    /// bytes between `{` and the matching `}` without re-tokenising.
628    input: &'a str,
629    /// Input characters
630    chars: Peekable<Chars<'a>>,
631    /// Current position
632    line: u32,
633    column: u32,
634    offset: u32,
635    /// Peeked token
636    peeked: Option<Spanned>,
637    /// Put-back buffer for characters we need to "unconsume"
638    putback: Option<(char, Position)>,
639    /// Maximum identifier characters (DoS limit, issue #87).
640    max_identifier_chars: usize,
641}
642
643impl<'a> Lexer<'a> {
644    /// Create a new lexer for the given input
645    pub fn new(input: &'a str) -> Self {
646        Self::with_limits(input, crate::limits::ParserLimits::default())
647    }
648
649    /// Create a new lexer with explicit DoS limits.
650    pub fn with_limits(input: &'a str, limits: crate::limits::ParserLimits) -> Self {
651        Self {
652            input,
653            chars: input.chars().peekable(),
654            line: 1,
655            column: 1,
656            offset: 0,
657            peeked: None,
658            putback: None,
659            max_identifier_chars: limits.max_identifier_chars,
660        }
661    }
662
663    /// Maximum identifier-length cap; queried by `scan_identifier`
664    /// to bail with a structured `LexerError` when an identifier
665    /// would exceed the configured cap.
666    ///
667    /// Exercised only by the embedded unit tests (production code reads the
668    /// `max_identifier_chars` field directly); kept verbatim from the
669    /// pre-extraction server module, where a crate-level `allow(dead_code)`
670    /// masked it. Move-only per #1102.
671    #[allow(dead_code)]
672    pub(crate) fn max_identifier_chars(&self) -> usize {
673        self.max_identifier_chars
674    }
675
676    /// Get current position
677    fn position(&self) -> Position {
678        Position::new(self.line, self.column, self.offset)
679    }
680
681    /// Put a character back into the stream
682    fn unget(&mut self, ch: char, pos: Position) {
683        self.putback = Some((ch, pos));
684    }
685
686    /// Advance and get next character
687    fn advance(&mut self) -> Option<char> {
688        // Check putback buffer first
689        if let Some((ch, pos)) = self.putback.take() {
690            // When we re-consume from putback, update position to after the char
691            self.line = pos.line;
692            self.column = pos.column + 1;
693            self.offset = pos.offset + ch.len_utf8() as u32;
694            return Some(ch);
695        }
696
697        let ch = self.chars.next()?;
698        self.offset += ch.len_utf8() as u32;
699        if ch == '\n' {
700            self.line += 1;
701            self.column = 1;
702        } else {
703            self.column += 1;
704        }
705        Some(ch)
706    }
707
708    /// Peek at next character
709    fn peek(&mut self) -> Option<char> {
710        // Check putback buffer first
711        if let Some((ch, _)) = &self.putback {
712            return Some(*ch);
713        }
714        self.chars.peek().copied()
715    }
716
717    /// Skip whitespace and comments
718    ///
719    /// Superseded by `skip_whitespace_simple`; retained verbatim from the
720    /// pre-extraction server module (where a crate-level `allow(dead_code)`
721    /// masked it). Kept untouched per the move-only contract of #1102.
722    #[allow(dead_code)]
723    fn skip_whitespace(&mut self) {
724        while let Some(ch) = self.peek() {
725            if ch.is_whitespace() {
726                self.advance();
727            } else if ch == '-' {
728                // Could be comment (--) or operator
729                let pos = self.position();
730                self.advance();
731                if self.peek() == Some('-') {
732                    // Line comment
733                    self.advance();
734                    while let Some(c) = self.peek() {
735                        if c == '\n' {
736                            break;
737                        }
738                        self.advance();
739                    }
740                } else {
741                    // Not a comment, put back - by restoring state
742                    // Since we can't put back, we'll handle this in next_token
743                    self.line = pos.line;
744                    self.column = pos.column;
745                    self.offset = pos.offset;
746                    // Need to reset chars iterator - this is tricky
747                    // Instead, we'll handle -- in scan_operator
748                    break;
749                }
750            } else {
751                break;
752            }
753        }
754    }
755
756    /// Peek at the next token without consuming it
757    pub fn peek_token(&mut self) -> Result<&Spanned, LexerError> {
758        if self.peeked.is_none() {
759            self.peeked = Some(self.next_token_internal()?);
760        }
761        Ok(self.peeked.as_ref().unwrap())
762    }
763
764    /// Get the next token
765    pub fn next_token(&mut self) -> Result<Spanned, LexerError> {
766        if let Some(tok) = self.peeked.take() {
767            return Ok(tok);
768        }
769        self.next_token_internal()
770    }
771
772    /// Internal implementation of next_token
773    fn next_token_internal(&mut self) -> Result<Spanned, LexerError> {
774        self.skip_whitespace_simple();
775
776        let start = self.position();
777
778        let ch = match self.peek() {
779            Some(c) => c,
780            None => {
781                return Ok(Spanned::new(Token::Eof, start, start));
782            }
783        };
784
785        // Dispatch based on first character
786        let token = match ch {
787            // String literals
788            '\'' | '"' => self.scan_string()?,
789
790            // Numbers
791            '0'..='9' => self.scan_number()?,
792
793            // Identifiers and keywords
794            'a'..='z' | 'A'..='Z' | '_' => self.scan_identifier()?,
795
796            // Operators and delimiters
797            '=' => {
798                self.advance();
799                // `=>` is the named-argument arrow (e.g. `resolution => 0.5`);
800                // a bare `=` stays the equality operator.
801                if self.peek() == Some('>') {
802                    self.advance();
803                    Token::FatArrow
804                } else {
805                    Token::Eq
806                }
807            }
808            '<' => self.scan_less_than()?,
809            '>' => self.scan_greater_than()?,
810            '!' => {
811                self.advance();
812                if self.peek() == Some('=') {
813                    self.advance();
814                    Token::Ne
815                } else {
816                    return Err(LexerError::new("Expected '=' after '!'", start));
817                }
818            }
819            '+' => {
820                self.advance();
821                Token::Plus
822            }
823            '-' => self.scan_minus()?,
824            '*' => {
825                self.advance();
826                Token::Star
827            }
828            '/' => {
829                self.advance();
830                Token::Slash
831            }
832            '%' => {
833                self.advance();
834                Token::Percent
835            }
836            '(' => {
837                self.advance();
838                Token::LParen
839            }
840            ')' => {
841                self.advance();
842                Token::RParen
843            }
844            '[' => {
845                self.advance();
846                Token::LBracket
847            }
848            ']' => {
849                self.advance();
850                Token::RBracket
851            }
852            '{' => {
853                // JSON sub-mode trigger: if the next non-whitespace char
854                // after `{` is `"`, scan a balanced raw `{...}` and emit
855                // `Token::JsonLiteral`. Otherwise fall through to the
856                // legacy `LBrace` token (Cypher property bag, etc.).
857                // The empty-object case `{}` also takes the JSON path so
858                // bare `VALUES ({})` matches `VALUES ('{}')`.
859                if self.looks_like_json_object_start() {
860                    return self.scan_json_literal(start);
861                }
862                self.advance();
863                Token::LBrace
864            }
865            '}' => {
866                self.advance();
867                Token::RBrace
868            }
869            ',' => {
870                self.advance();
871                Token::Comma
872            }
873            '.' => self.scan_dot()?,
874            ':' => {
875                self.advance();
876                Token::Colon
877            }
878            ';' => {
879                self.advance();
880                Token::Semi
881            }
882            '$' => {
883                self.advance();
884                Token::Dollar
885            }
886            '?' => {
887                self.advance();
888                Token::Question
889            }
890            '|' => {
891                self.advance();
892                if self.peek() == Some('|') {
893                    self.advance();
894                    Token::DoublePipe
895                } else {
896                    Token::Pipe
897                }
898            }
899            _ => {
900                return Err(LexerError::new(
901                    format!("Unexpected character: '{}'", ch),
902                    start,
903                ));
904            }
905        };
906
907        let end = self.position();
908        Ok(Spanned::new(token, start, end))
909    }
910
911    /// Simple whitespace skip (no comment handling to avoid complexity)
912    fn skip_whitespace_simple(&mut self) {
913        while let Some(ch) = self.peek() {
914            if ch.is_whitespace() {
915                self.advance();
916            } else if ch == '-' && self.input[self.offset as usize..].starts_with("--") {
917                self.advance();
918                self.advance();
919                while let Some(c) = self.peek() {
920                    if c == '\n' {
921                        break;
922                    }
923                    self.advance();
924                }
925            } else if ch == '/' && self.input[self.offset as usize..].starts_with("/*") {
926                self.advance();
927                self.advance();
928                while let Some(c) = self.peek() {
929                    self.advance();
930                    if c == '*' && self.peek() == Some('/') {
931                        self.advance();
932                        break;
933                    }
934                }
935            } else {
936                break;
937            }
938        }
939    }
940
941    /// Scan a string literal
942    fn scan_string(&mut self) -> Result<Token, LexerError> {
943        let quote = self.advance().unwrap(); // ' or "
944        let start = self.position();
945        let mut value = String::new();
946
947        loop {
948            match self.peek() {
949                None => {
950                    return Err(LexerError::new("Unterminated string", start));
951                }
952                Some(c) if c == quote => {
953                    self.advance();
954                    // Check for escaped quote ('')
955                    if self.peek() == Some(quote) {
956                        self.advance();
957                        value.push(quote);
958                    } else {
959                        break;
960                    }
961                }
962                Some('\\') => {
963                    self.advance();
964                    match self.peek() {
965                        Some('n') => {
966                            self.advance();
967                            value.push('\n');
968                        }
969                        Some('r') => {
970                            self.advance();
971                            value.push('\r');
972                        }
973                        Some('t') => {
974                            self.advance();
975                            value.push('\t');
976                        }
977                        Some('\\') => {
978                            self.advance();
979                            value.push('\\');
980                        }
981                        Some(c) if c == quote => {
982                            self.advance();
983                            value.push(quote);
984                        }
985                        Some(c) => {
986                            // Unknown escape, keep as-is
987                            value.push('\\');
988                            value.push(c);
989                            self.advance();
990                        }
991                        None => {
992                            return Err(LexerError::new("Unterminated string", start));
993                        }
994                    }
995                }
996                Some(c) => {
997                    self.advance();
998                    value.push(c);
999                }
1000            }
1001        }
1002
1003        Ok(Token::String(value))
1004    }
1005
1006    /// Scan a number (integer or float)
1007    fn scan_number(&mut self) -> Result<Token, LexerError> {
1008        let mut value = String::new();
1009        let mut is_float = false;
1010
1011        // Integer part
1012        while let Some(ch) = self.peek() {
1013            if ch.is_ascii_digit() {
1014                value.push(ch);
1015                self.advance();
1016            } else {
1017                break;
1018            }
1019        }
1020
1021        // Check for decimal point
1022        if self.peek() == Some('.') {
1023            // Look ahead to distinguish from .. and method calls
1024            let dot_pos = self.position();
1025            self.advance(); // consume the first '.'
1026
1027            if self.peek() == Some('.') {
1028                // It's ".." - put back the first dot using unget
1029                self.unget('.', dot_pos);
1030                // Return integer without the dot
1031            } else if self.peek().map(|c| c.is_ascii_digit()).unwrap_or(false) {
1032                is_float = true;
1033                value.push('.');
1034                while let Some(ch) = self.peek() {
1035                    if ch.is_ascii_digit() {
1036                        value.push(ch);
1037                        self.advance();
1038                    } else {
1039                        break;
1040                    }
1041                }
1042            } else {
1043                // Just a dot after number (like `x.method`), put it back
1044                self.unget('.', dot_pos);
1045            }
1046        }
1047
1048        // Check for exponent
1049        if self.peek() == Some('e') || self.peek() == Some('E') {
1050            is_float = true;
1051            value.push(self.advance().unwrap());
1052
1053            if self.peek() == Some('+') || self.peek() == Some('-') {
1054                value.push(self.advance().unwrap());
1055            }
1056
1057            while let Some(ch) = self.peek() {
1058                if ch.is_ascii_digit() {
1059                    value.push(ch);
1060                    self.advance();
1061                } else {
1062                    break;
1063                }
1064            }
1065        }
1066
1067        if is_float {
1068            match value.parse::<f64>() {
1069                Ok(n) => Ok(Token::Float(n)),
1070                Err(_) => Err(LexerError::new(
1071                    format!("Invalid float: {}", value),
1072                    self.position(),
1073                )),
1074            }
1075        } else {
1076            match value.parse::<i64>() {
1077                Ok(n) => Ok(Token::Integer(n)),
1078                Err(_) => Err(LexerError::new(
1079                    format!("Invalid integer: {}", value),
1080                    self.position(),
1081                )),
1082            }
1083        }
1084    }
1085
1086    /// Scan an identifier or keyword
1087    fn scan_identifier(&mut self) -> Result<Token, LexerError> {
1088        let start_pos = self.position();
1089        let mut value = String::new();
1090        let max = self.max_identifier_chars;
1091
1092        while let Some(ch) = self.peek() {
1093            if ch.is_alphanumeric() || ch == '_' {
1094                if value.chars().count() >= max {
1095                    // Bail before pushing — every additional char is
1096                    // bounded work for the attacker (1 char of input
1097                    // = 1 byte of allocation), so refuse early.
1098                    return Err(LexerError::with_limit(
1099                        format!(
1100                            "identifier exceeds maximum length (max_identifier_chars = {})",
1101                            max
1102                        ),
1103                        start_pos,
1104                        LexerLimitHit::IdentifierTooLong {
1105                            limit_name: "max_identifier_chars",
1106                            value: max,
1107                        },
1108                    ));
1109                }
1110                value.push(ch);
1111                self.advance();
1112            } else {
1113                break;
1114            }
1115        }
1116
1117        // Check for keywords (case-insensitive)
1118        let token = match value.to_uppercase().as_str() {
1119            "SELECT" => Token::Select,
1120            "FROM" => Token::From,
1121            "WHERE" => Token::Where,
1122            "AND" => Token::And,
1123            "OR" => Token::Or,
1124            "NOT" => Token::Not,
1125            "MATCH" => Token::Match,
1126            "RETURN" => Token::Return,
1127            "JOIN" => Token::Join,
1128            "GRAPH" => Token::Graph,
1129            "PATH" => Token::Path,
1130            "TO" => Token::To,
1131            "VIA" => Token::Via,
1132            "ON" => Token::On,
1133            "AS" => Token::As,
1134            "IS" => Token::Is,
1135            "NULL" => Token::Null,
1136            "BETWEEN" => Token::Between,
1137            "LIKE" => Token::Like,
1138            "IN" => Token::In,
1139            "ORDER" => Token::Order,
1140            "BY" => Token::By,
1141            "ASC" => Token::Asc,
1142            "DESC" => Token::Desc,
1143            "NULLS" => Token::Nulls,
1144            "FIRST" => Token::First,
1145            "LAST" => Token::Last,
1146            "LIMIT" => Token::Limit,
1147            "OFFSET" => Token::Offset,
1148            "INNER" => Token::Inner,
1149            "LEFT" => Token::Left,
1150            "RIGHT" => Token::Right,
1151            "OUTER" => Token::Outer,
1152            "FULL" => Token::Full,
1153            "CROSS" => Token::Cross,
1154            "STARTS" => Token::Starts,
1155            "ENDS" => Token::Ends,
1156            "WITH" => Token::With,
1157            "CONTAINS" => Token::Contains,
1158            "TRUE" => Token::True,
1159            "FALSE" => Token::False,
1160            "ENRICH" => Token::Enrich,
1161            "GROUP" => Token::Group,
1162            "COUNT" => Token::Count,
1163            "SUM" => Token::Sum,
1164            "AVG" => Token::Avg,
1165            "MIN" => Token::Min,
1166            "MAX" => Token::Max,
1167            "DISTINCT" => Token::Distinct,
1168            "VECTOR" => Token::Vector,
1169            "SEARCH" => Token::Search,
1170            "SIMILAR" => Token::Similar,
1171            "COLLECTION" => Token::Collection,
1172            "METRIC" => Token::Metric,
1173            "THRESHOLD" => Token::Threshold,
1174            "K" => Token::K,
1175            "HYBRID" => Token::Hybrid,
1176            "FUSION" => Token::Fusion,
1177            "RERANK" => Token::Rerank,
1178            "RRF" => Token::Rrf,
1179            "INTERSECTION" => Token::Intersection,
1180            "UNION" => Token::Union,
1181            "RECURSIVE" => Token::Recursive,
1182            "ALL" => Token::All,
1183            "WEIGHT" => Token::Weight,
1184            "L2" => Token::L2,
1185            "COSINE" => Token::Cosine,
1186            "INNER_PRODUCT" | "INNERPRODUCT" => Token::InnerProduct,
1187            "INCLUDE" => Token::Include,
1188            "METADATA" => Token::Metadata,
1189            "VECTORS" => Token::Vectors,
1190            "EXPLAIN" => Token::Explain,
1191            "FOR" => Token::For,
1192            "FORMAT" => Token::Format,
1193            "JSON" => Token::Json,
1194            "INSERT" => Token::Insert,
1195            "INTO" => Token::Into,
1196            "VALUES" => Token::Values,
1197            "UPDATE" => Token::Update,
1198            "SET" => Token::Set,
1199            "DELETE" => Token::Delete,
1200            "TRUNCATE" => Token::Truncate,
1201            "CREATE" => Token::Create,
1202            "TABLE" => Token::Table,
1203            "DROP" => Token::Drop,
1204            "ALTER" => Token::Alter,
1205            "ADD" => Token::Add,
1206            "COLUMN" => Token::Column,
1207            "PRIMARY" => Token::Primary,
1208            "KEY" => Token::Key,
1209            "DEFAULT" => Token::Default,
1210            "COMPRESS" => Token::Compress,
1211            "INDEX" => Token::Index,
1212            "UNIQUE" => Token::Unique,
1213            "IF" => Token::If,
1214            "EXISTS" => Token::Exists,
1215            "RETURNING" => Token::Returning,
1216            "CASCADE" => Token::Cascade,
1217            "RENAME" => Token::Rename,
1218            "USING" => Token::Using,
1219            "NODE" => Token::Node,
1220            "EDGE" => Token::Edge,
1221            "DOCUMENT" => Token::Document,
1222            "KV" => Token::Kv,
1223            "TIMESERIES" => Token::Timeseries,
1224            "RETENTION" => Token::Retention,
1225            "QUEUE" => Token::Queue,
1226            "TREE" => Token::Tree,
1227            "PUSH" => Token::Push,
1228            "POP" => Token::Pop,
1229            "PEEK" => Token::Peek,
1230            "PURGE" => Token::Purge,
1231            "ACK" => Token::Ack,
1232            "NACK" => Token::Nack,
1233            "PRIORITY" => Token::Priority,
1234            "LPUSH" => Token::Ident("LPUSH".to_string()),
1235            "RPUSH" => Token::Ident("RPUSH".to_string()),
1236            "LPOP" => Token::Ident("LPOP".to_string()),
1237            "RPOP" => Token::Ident("RPOP".to_string()),
1238            "NEIGHBORHOOD" => Token::Neighborhood,
1239            "SHORTEST_PATH" | "SHORTESTPATH" => Token::ShortestPath,
1240            "CENTRALITY" => Token::Centrality,
1241            "COMMUNITY" => Token::Community,
1242            "COMPONENTS" => Token::Components,
1243            "CYCLES" => Token::Cycles,
1244            "TRAVERSE" => Token::Traverse,
1245            "DEPTH" => Token::Depth,
1246            "DIRECTION" => Token::Direction,
1247            "ALGORITHM" => Token::Algorithm,
1248            "STRATEGY" => Token::Strategy,
1249            "MAX_ITERATIONS" | "MAXITERATIONS" => Token::MaxIterations,
1250            "MAX_LENGTH" | "MAXLENGTH" => Token::MaxLength,
1251            "MODE" => Token::Mode,
1252            "CLUSTERING" => Token::Clustering,
1253            "TOPOLOGICAL_SORT" | "TOPOLOGICALSORT" => Token::TopologicalSort,
1254            "PROPERTIES" => Token::Properties,
1255            "TEXT" => Token::Text,
1256            "FUZZY" => Token::Fuzzy,
1257            "MIN_SCORE" | "MINSCORE" => Token::MinScore,
1258            "BEGIN" => Token::Begin,
1259            "COMMIT" => Token::Commit,
1260            "ROLLBACK" => Token::Rollback,
1261            "SAVEPOINT" => Token::Savepoint,
1262            "RELEASE" => Token::Release,
1263            "START" => Token::Start,
1264            "TRANSACTION" => Token::Transaction,
1265            "WORK" => Token::Work,
1266            "VACUUM" => Token::Vacuum,
1267            "ANALYZE" => Token::Analyze,
1268            "SCHEMA" => Token::Schema,
1269            "SEQUENCE" => Token::Sequence,
1270            "INCREMENT" => Token::Increment,
1271            "COPY" => Token::Copy,
1272            "HEADER" => Token::Header,
1273            "DELIMITER" => Token::Delimiter,
1274            "VIEW" => Token::View,
1275            "MATERIALIZED" => Token::Materialized,
1276            "REFRESH" => Token::Refresh,
1277            "PARTITION" => Token::Partition,
1278            "RANGE" => Token::Range,
1279            "LIST" => Token::List,
1280            "HASH" => Token::Hash,
1281            "ATTACH" => Token::Attach,
1282            "DETACH" => Token::Detach,
1283            "OF" => Token::Of,
1284            "POLICY" => Token::Policy,
1285            "ENABLE" => Token::Enable,
1286            "DISABLE" => Token::Disable,
1287            "SECURITY" => Token::Security,
1288            "ROW" => Token::Row,
1289            "LEVEL" => Token::Level,
1290            "FOREIGN" => Token::Foreign,
1291            "SERVER" => Token::Server,
1292            "WRAPPER" => Token::Wrapper,
1293            "OPTIONS" => Token::Options,
1294            "DATA" => Token::Data,
1295            "SESSIONIZE" => Token::Sessionize,
1296            "GAP" => Token::Gap,
1297            "OVER" => Token::Over,
1298            "ROWS" => Token::Rows,
1299            "PRECEDING" => Token::Preceding,
1300            "FOLLOWING" => Token::Following,
1301            "UNBOUNDED" => Token::Unbounded,
1302            "CURRENT" => Token::Current,
1303            _ => Token::Ident(value),
1304        };
1305
1306        Ok(token)
1307    }
1308
1309    /// Scan less-than variants: <, <=, <>, <-
1310    fn scan_less_than(&mut self) -> Result<Token, LexerError> {
1311        self.advance(); // consume '<'
1312        match self.peek() {
1313            Some('=') => {
1314                self.advance();
1315                Ok(Token::Le)
1316            }
1317            Some('>') => {
1318                self.advance();
1319                Ok(Token::Ne)
1320            }
1321            Some('-') => {
1322                self.advance();
1323                Ok(Token::ArrowLeft)
1324            }
1325            _ => Ok(Token::Lt),
1326        }
1327    }
1328
1329    /// Scan greater-than variants: >, >=
1330    fn scan_greater_than(&mut self) -> Result<Token, LexerError> {
1331        self.advance(); // consume '>'
1332        if self.peek() == Some('=') {
1333            self.advance();
1334            Ok(Token::Ge)
1335        } else {
1336            Ok(Token::Gt)
1337        }
1338    }
1339
1340    /// Scan minus variants: -, ->, --comment
1341    fn scan_minus(&mut self) -> Result<Token, LexerError> {
1342        self.advance(); // consume '-'
1343        match self.peek() {
1344            Some('>') => {
1345                self.advance();
1346                Ok(Token::Arrow)
1347            }
1348            Some('-') => {
1349                // Line comment, skip to end of line
1350                self.advance();
1351                while let Some(c) = self.peek() {
1352                    if c == '\n' {
1353                        break;
1354                    }
1355                    self.advance();
1356                }
1357                // Recursively get next token
1358                self.skip_whitespace_simple();
1359                if self.peek().is_none() {
1360                    Ok(Token::Eof)
1361                } else {
1362                    let next = self.next_token_internal()?;
1363                    Ok(next.token)
1364                }
1365            }
1366            _ => Ok(Token::Dash),
1367        }
1368    }
1369
1370    /// Scan dot variants: ., ..
1371    fn scan_dot(&mut self) -> Result<Token, LexerError> {
1372        self.advance(); // consume '.'
1373        if self.peek() == Some('.') {
1374            self.advance();
1375            Ok(Token::DotDot)
1376        } else {
1377            Ok(Token::Dot)
1378        }
1379    }
1380
1381    /// Look ahead from the current `{` to decide whether this is a
1382    /// JSON object literal (next non-ws char is `"` or `}`) or a
1383    /// legacy brace token (Cypher property bag, Python-style key
1384    /// without quotes, etc.). Pure read — does not advance.
1385    fn looks_like_json_object_start(&self) -> bool {
1386        let bytes = self.input.as_bytes();
1387        let mut i = self.offset as usize;
1388        // We're at `{`. Look one past it.
1389        debug_assert!(bytes.get(i) == Some(&b'{'));
1390        i += 1;
1391        while i < bytes.len() {
1392            match bytes[i] {
1393                b' ' | b'\t' | b'\n' | b'\r' => i += 1,
1394                b'"' | b'}' => return true,
1395                _ => return false,
1396            }
1397        }
1398        false
1399    }
1400
1401    /// JSON sub-mode: scan a balanced `{...}` from the current `{`,
1402    /// returning a `Spanned(Token::JsonLiteral(raw_text), …)`.
1403    ///
1404    /// Tracks string boundaries so `{` and `}` inside `"..."` don't
1405    /// affect the brace counter. Honours `\\`, `\"`, `\\` etc. so an
1406    /// escaped quote does not close the string. Errors on EOF inside
1407    /// an unbalanced literal and on payload size > JSON_LITERAL_MAX_BYTES.
1408    ///
1409    /// State machine:
1410    /// - `Outside` (default): counts `{`/`}`, transitions to `InString` on `"`
1411    /// - `InString`: ignores braces, transitions back to `Outside` on
1412    ///   unescaped `"`. On `\`, transitions to `EscapeInString`.
1413    /// - `EscapeInString`: consume one byte unconditionally then back to
1414    ///   `InString`. Multi-byte UTF-8 sequences after `\u` are handled by
1415    ///   ordinary char iteration; we don't validate the JSON here, just
1416    ///   the brace balance.
1417    fn scan_json_literal(&mut self, start: Position) -> Result<Spanned, LexerError> {
1418        let start_offset = self.offset as usize;
1419        // Consume the opening `{`.
1420        self.advance();
1421        let mut depth: u32 = 1;
1422        let mut in_string = false;
1423        let mut escape = false;
1424        loop {
1425            let ch = match self.peek() {
1426                Some(c) => c,
1427                None => {
1428                    return Err(LexerError::new(
1429                        format!(
1430                            "unterminated JSON object literal (started at offset {})",
1431                            start.offset
1432                        ),
1433                        self.position(),
1434                    ));
1435                }
1436            };
1437
1438            // Enforce payload size limit on the raw scan.
1439            let scanned_bytes = self.offset as usize - start_offset;
1440            if scanned_bytes > JSON_LITERAL_MAX_BYTES {
1441                return Err(LexerError::new(
1442                    format!(
1443                        "JSON object literal exceeds JSON_LITERAL_MAX_BYTES ({} bytes)",
1444                        JSON_LITERAL_MAX_BYTES
1445                    ),
1446                    start,
1447                ));
1448            }
1449
1450            self.advance();
1451
1452            if escape {
1453                escape = false;
1454                continue;
1455            }
1456
1457            if in_string {
1458                match ch {
1459                    '\\' => escape = true,
1460                    '"' => in_string = false,
1461                    _ => {}
1462                }
1463                continue;
1464            }
1465
1466            match ch {
1467                '"' => in_string = true,
1468                '{' => depth += 1,
1469                '}' => {
1470                    depth -= 1;
1471                    if depth == 0 {
1472                        let end = self.position();
1473                        let end_offset = self.offset as usize;
1474                        // Final size check including the trailing `}`.
1475                        if end_offset - start_offset > JSON_LITERAL_MAX_BYTES {
1476                            return Err(LexerError::new(
1477                                format!(
1478                                    "JSON object literal exceeds JSON_LITERAL_MAX_BYTES ({} bytes)",
1479                                    JSON_LITERAL_MAX_BYTES
1480                                ),
1481                                start,
1482                            ));
1483                        }
1484                        let raw = self.input[start_offset..end_offset].to_string();
1485                        return Ok(Spanned::new(Token::JsonLiteral(raw), start, end));
1486                    }
1487                }
1488                _ => {}
1489            }
1490        }
1491    }
1492
1493    /// Tokenize entire input
1494    pub fn tokenize(&mut self) -> Result<Vec<Spanned>, LexerError> {
1495        let mut tokens = Vec::new();
1496        loop {
1497            let tok = self.next_token()?;
1498            let is_eof = tok.token == Token::Eof;
1499            tokens.push(tok);
1500            if is_eof {
1501                break;
1502            }
1503        }
1504        Ok(tokens)
1505    }
1506}
1507
1508// ============================================================================
1509// Tests
1510// ============================================================================
1511
1512#[cfg(test)]
1513mod tests {
1514    use super::*;
1515
1516    fn tokenize(input: &str) -> Vec<Token> {
1517        let mut lexer = Lexer::new(input);
1518        lexer
1519            .tokenize()
1520            .unwrap()
1521            .into_iter()
1522            .map(|s| s.token)
1523            .collect()
1524    }
1525
1526    #[test]
1527    fn test_keywords() {
1528        let tokens = tokenize("SELECT FROM WHERE AND OR NOT");
1529        assert_eq!(
1530            tokens,
1531            vec![
1532                Token::Select,
1533                Token::From,
1534                Token::Where,
1535                Token::And,
1536                Token::Or,
1537                Token::Not,
1538                Token::Eof
1539            ]
1540        );
1541    }
1542
1543    #[test]
1544    fn test_identifiers() {
1545        let tokens = tokenize("hosts users ip_address");
1546        assert_eq!(
1547            tokens,
1548            vec![
1549                Token::Ident("hosts".into()),
1550                Token::Ident("users".into()),
1551                Token::Ident("ip_address".into()),
1552                Token::Eof
1553            ]
1554        );
1555    }
1556
1557    #[test]
1558    fn test_numbers() {
1559        let tokens = tokenize("42 2.5 1e10 2.5e-3");
1560        assert_eq!(
1561            tokens,
1562            vec![
1563                Token::Integer(42),
1564                Token::Float(2.5),
1565                Token::Float(1e10),
1566                Token::Float(2.5e-3),
1567                Token::Eof
1568            ]
1569        );
1570    }
1571
1572    #[test]
1573    fn test_strings() {
1574        let tokens = tokenize("'hello' \"world\" 'it''s'");
1575        assert_eq!(
1576            tokens,
1577            vec![
1578                Token::String("hello".into()),
1579                Token::String("world".into()),
1580                Token::String("it's".into()),
1581                Token::Eof
1582            ]
1583        );
1584    }
1585
1586    #[test]
1587    fn test_operators() {
1588        let tokens = tokenize("= <> < <= > >= != + - * /");
1589        assert_eq!(
1590            tokens,
1591            vec![
1592                Token::Eq,
1593                Token::Ne,
1594                Token::Lt,
1595                Token::Le,
1596                Token::Gt,
1597                Token::Ge,
1598                Token::Ne,
1599                Token::Plus,
1600                Token::Dash,
1601                Token::Star,
1602                Token::Slash,
1603                Token::Eof
1604            ]
1605        );
1606    }
1607
1608    #[test]
1609    fn test_delimiters() {
1610        // Note: `{ a }` (not `{ }`) — a bare `{ }` now triggers JSON
1611        // sub-mode and lexes as a single `JsonLiteral("{ }")` token.
1612        // The brace pair around a non-string token still produces the
1613        // legacy LBrace/RBrace pair (Cypher property bag, etc.).
1614        let tokens = tokenize("( ) [ ] { a } , . : ;");
1615        assert_eq!(
1616            tokens,
1617            vec![
1618                Token::LParen,
1619                Token::RParen,
1620                Token::LBracket,
1621                Token::RBracket,
1622                Token::LBrace,
1623                Token::Ident("a".into()),
1624                Token::RBrace,
1625                Token::Comma,
1626                Token::Dot,
1627                Token::Colon,
1628                Token::Semi,
1629                Token::Eof
1630            ]
1631        );
1632    }
1633
1634    #[test]
1635    fn test_json_literal_empty_object() {
1636        let tokens = tokenize("{ }");
1637        assert_eq!(tokens, vec![Token::JsonLiteral("{ }".into()), Token::Eof]);
1638    }
1639
1640    #[test]
1641    fn test_json_literal_simple() {
1642        let tokens = tokenize(r#"{"a":1}"#);
1643        assert_eq!(
1644            tokens,
1645            vec![Token::JsonLiteral(r#"{"a":1}"#.into()), Token::Eof]
1646        );
1647    }
1648
1649    #[test]
1650    fn test_json_literal_nested() {
1651        let raw = r#"{"a":{"b":[1,2,{"c":"}"}]}}"#;
1652        let tokens = tokenize(raw);
1653        assert_eq!(tokens, vec![Token::JsonLiteral(raw.into()), Token::Eof]);
1654    }
1655
1656    #[test]
1657    fn test_json_literal_escaped_quote_in_string() {
1658        // The `}` inside the escaped-quote string must not close the object.
1659        let raw = r#"{"path":"O\"Brien}"}"#;
1660        let tokens = tokenize(raw);
1661        assert_eq!(tokens, vec![Token::JsonLiteral(raw.into()), Token::Eof]);
1662    }
1663
1664    #[test]
1665    fn test_json_literal_unbalanced_eof() {
1666        let mut lexer = Lexer::new(r#"{"a":1"#);
1667        let err = lexer.tokenize().expect_err("expected unterminated error");
1668        assert!(
1669            err.message.contains("unterminated JSON object literal"),
1670            "got: {}",
1671            err.message
1672        );
1673    }
1674
1675    #[test]
1676    fn test_json_literal_property_bag_compatible() {
1677        // Cypher-style property bag must still tokenise as LBrace/.../RBrace
1678        // because the inner content does not start with `"`.
1679        let tokens = tokenize("{name: 'value'}");
1680        assert_eq!(tokens[0], Token::LBrace);
1681        assert_eq!(*tokens.last().unwrap(), Token::Eof);
1682    }
1683
1684    #[test]
1685    fn test_graph_syntax() {
1686        let tokens = tokenize("-> <- - ..");
1687        assert_eq!(
1688            tokens,
1689            vec![
1690                Token::Arrow,
1691                Token::ArrowLeft,
1692                Token::Dash,
1693                Token::DotDot,
1694                Token::Eof
1695            ]
1696        );
1697    }
1698
1699    #[test]
1700    fn test_table_query() {
1701        let tokens = tokenize("SELECT ip, hostname FROM hosts WHERE os = 'Linux' LIMIT 10");
1702        assert_eq!(
1703            tokens,
1704            vec![
1705                Token::Select,
1706                Token::Ident("ip".into()),
1707                Token::Comma,
1708                Token::Ident("hostname".into()),
1709                Token::From,
1710                Token::Ident("hosts".into()),
1711                Token::Where,
1712                Token::Ident("os".into()),
1713                Token::Eq,
1714                Token::String("Linux".into()),
1715                Token::Limit,
1716                Token::Integer(10),
1717                Token::Eof
1718            ]
1719        );
1720    }
1721
1722    #[test]
1723    fn test_graph_query() {
1724        let tokens = tokenize("MATCH (h:Host)-[:HAS_SERVICE]->(s:Service) RETURN h, s");
1725        assert_eq!(
1726            tokens,
1727            vec![
1728                Token::Match,
1729                Token::LParen,
1730                Token::Ident("h".into()),
1731                Token::Colon,
1732                Token::Ident("Host".into()),
1733                Token::RParen,
1734                Token::Dash,
1735                Token::LBracket,
1736                Token::Colon,
1737                Token::Ident("HAS_SERVICE".into()),
1738                Token::RBracket,
1739                Token::Arrow,
1740                Token::LParen,
1741                Token::Ident("s".into()),
1742                Token::Colon,
1743                Token::Ident("Service".into()),
1744                Token::RParen,
1745                Token::Return,
1746                Token::Ident("h".into()),
1747                Token::Comma,
1748                Token::Ident("s".into()),
1749                Token::Eof
1750            ]
1751        );
1752    }
1753
1754    #[test]
1755    fn test_join_query() {
1756        let tokens = tokenize("FROM hosts h JOIN GRAPH (h)-[:HAS_VULN]->(v) ON h.ip = v.id");
1757        assert_eq!(
1758            tokens,
1759            vec![
1760                Token::From,
1761                Token::Ident("hosts".into()),
1762                Token::Ident("h".into()),
1763                Token::Join,
1764                Token::Graph,
1765                Token::LParen,
1766                Token::Ident("h".into()),
1767                Token::RParen,
1768                Token::Dash,
1769                Token::LBracket,
1770                Token::Colon,
1771                Token::Ident("HAS_VULN".into()),
1772                Token::RBracket,
1773                Token::Arrow,
1774                Token::LParen,
1775                Token::Ident("v".into()),
1776                Token::RParen,
1777                Token::On,
1778                Token::Ident("h".into()),
1779                Token::Dot,
1780                Token::Ident("ip".into()),
1781                Token::Eq,
1782                Token::Ident("v".into()),
1783                Token::Dot,
1784                Token::Ident("id".into()),
1785                Token::Eof
1786            ]
1787        );
1788    }
1789
1790    #[test]
1791    fn test_path_query() {
1792        let tokens = tokenize("PATH FROM host('192.168.1.1') TO host('10.0.0.1') VIA [:AUTH]");
1793        assert_eq!(
1794            tokens,
1795            vec![
1796                Token::Path,
1797                Token::From,
1798                Token::Ident("host".into()),
1799                Token::LParen,
1800                Token::String("192.168.1.1".into()),
1801                Token::RParen,
1802                Token::To,
1803                Token::Ident("host".into()),
1804                Token::LParen,
1805                Token::String("10.0.0.1".into()),
1806                Token::RParen,
1807                Token::Via,
1808                Token::LBracket,
1809                Token::Colon,
1810                Token::Ident("AUTH".into()),
1811                Token::RBracket,
1812                Token::Eof
1813            ]
1814        );
1815    }
1816
1817    #[test]
1818    fn test_variable_length_pattern() {
1819        let tokens = tokenize("(a)-[*1..5]->(b)");
1820        assert_eq!(
1821            tokens,
1822            vec![
1823                Token::LParen,
1824                Token::Ident("a".into()),
1825                Token::RParen,
1826                Token::Dash,
1827                Token::LBracket,
1828                Token::Star,
1829                Token::Integer(1),
1830                Token::DotDot,
1831                Token::Integer(5),
1832                Token::RBracket,
1833                Token::Arrow,
1834                Token::LParen,
1835                Token::Ident("b".into()),
1836                Token::RParen,
1837                Token::Eof
1838            ]
1839        );
1840    }
1841
1842    #[test]
1843    fn test_case_insensitive_keywords() {
1844        let tokens = tokenize("select FROM Where AND");
1845        assert_eq!(
1846            tokens,
1847            vec![
1848                Token::Select,
1849                Token::From,
1850                Token::Where,
1851                Token::And,
1852                Token::Eof
1853            ]
1854        );
1855    }
1856
1857    #[test]
1858    fn test_comments() {
1859        let tokens = tokenize("SELECT -- this is a comment\nip FROM hosts");
1860        assert_eq!(
1861            tokens,
1862            vec![
1863                Token::Select,
1864                Token::Ident("ip".into()),
1865                Token::From,
1866                Token::Ident("hosts".into()),
1867                Token::Eof
1868            ]
1869        );
1870    }
1871
1872    #[test]
1873    fn test_escaped_strings() {
1874        let tokens = tokenize(r"'hello\nworld' 'tab\there'");
1875        assert_eq!(
1876            tokens,
1877            vec![
1878                Token::String("hello\nworld".into()),
1879                Token::String("tab\there".into()),
1880                Token::Eof
1881            ]
1882        );
1883    }
1884
1885    #[test]
1886    fn test_keyword_matrix_and_alias_spellings() {
1887        let cases = [
1888            ("SELECT", Token::Select),
1889            ("FROM", Token::From),
1890            ("WHERE", Token::Where),
1891            ("AND", Token::And),
1892            ("OR", Token::Or),
1893            ("NOT", Token::Not),
1894            ("MATCH", Token::Match),
1895            ("RETURN", Token::Return),
1896            ("JOIN", Token::Join),
1897            ("GRAPH", Token::Graph),
1898            ("PATH", Token::Path),
1899            ("TO", Token::To),
1900            ("VIA", Token::Via),
1901            ("ON", Token::On),
1902            ("AS", Token::As),
1903            ("IS", Token::Is),
1904            ("NULL", Token::Null),
1905            ("BETWEEN", Token::Between),
1906            ("LIKE", Token::Like),
1907            ("IN", Token::In),
1908            ("ORDER", Token::Order),
1909            ("BY", Token::By),
1910            ("ASC", Token::Asc),
1911            ("DESC", Token::Desc),
1912            ("NULLS", Token::Nulls),
1913            ("FIRST", Token::First),
1914            ("LAST", Token::Last),
1915            ("LIMIT", Token::Limit),
1916            ("OFFSET", Token::Offset),
1917            ("INNER", Token::Inner),
1918            ("LEFT", Token::Left),
1919            ("RIGHT", Token::Right),
1920            ("OUTER", Token::Outer),
1921            ("FULL", Token::Full),
1922            ("CROSS", Token::Cross),
1923            ("STARTS", Token::Starts),
1924            ("ENDS", Token::Ends),
1925            ("WITH", Token::With),
1926            ("CONTAINS", Token::Contains),
1927            ("TRUE", Token::True),
1928            ("FALSE", Token::False),
1929            ("ENRICH", Token::Enrich),
1930            ("GROUP", Token::Group),
1931            ("COUNT", Token::Count),
1932            ("SUM", Token::Sum),
1933            ("AVG", Token::Avg),
1934            ("MIN", Token::Min),
1935            ("MAX", Token::Max),
1936            ("DISTINCT", Token::Distinct),
1937            ("VECTOR", Token::Vector),
1938            ("SEARCH", Token::Search),
1939            ("SIMILAR", Token::Similar),
1940            ("COLLECTION", Token::Collection),
1941            ("METRIC", Token::Metric),
1942            ("THRESHOLD", Token::Threshold),
1943            ("K", Token::K),
1944            ("HYBRID", Token::Hybrid),
1945            ("FUSION", Token::Fusion),
1946            ("RERANK", Token::Rerank),
1947            ("RRF", Token::Rrf),
1948            ("INTERSECTION", Token::Intersection),
1949            ("UNION", Token::Union),
1950            ("RECURSIVE", Token::Recursive),
1951            ("ALL", Token::All),
1952            ("WEIGHT", Token::Weight),
1953            ("L2", Token::L2),
1954            ("COSINE", Token::Cosine),
1955            ("INNER_PRODUCT", Token::InnerProduct),
1956            ("INNERPRODUCT", Token::InnerProduct),
1957            ("INCLUDE", Token::Include),
1958            ("METADATA", Token::Metadata),
1959            ("VECTORS", Token::Vectors),
1960            ("EXPLAIN", Token::Explain),
1961            ("FOR", Token::For),
1962            ("FORMAT", Token::Format),
1963            ("JSON", Token::Json),
1964            ("INSERT", Token::Insert),
1965            ("INTO", Token::Into),
1966            ("VALUES", Token::Values),
1967            ("UPDATE", Token::Update),
1968            ("SET", Token::Set),
1969            ("DELETE", Token::Delete),
1970            ("TRUNCATE", Token::Truncate),
1971            ("CREATE", Token::Create),
1972            ("TABLE", Token::Table),
1973            ("DROP", Token::Drop),
1974            ("ALTER", Token::Alter),
1975            ("ADD", Token::Add),
1976            ("COLUMN", Token::Column),
1977            ("PRIMARY", Token::Primary),
1978            ("KEY", Token::Key),
1979            ("DEFAULT", Token::Default),
1980            ("COMPRESS", Token::Compress),
1981            ("INDEX", Token::Index),
1982            ("UNIQUE", Token::Unique),
1983            ("IF", Token::If),
1984            ("EXISTS", Token::Exists),
1985            ("RETURNING", Token::Returning),
1986            ("CASCADE", Token::Cascade),
1987            ("RENAME", Token::Rename),
1988            ("USING", Token::Using),
1989            ("NODE", Token::Node),
1990            ("EDGE", Token::Edge),
1991            ("DOCUMENT", Token::Document),
1992            ("KV", Token::Kv),
1993            ("TIMESERIES", Token::Timeseries),
1994            ("RETENTION", Token::Retention),
1995            ("QUEUE", Token::Queue),
1996            ("TREE", Token::Tree),
1997            ("PUSH", Token::Push),
1998            ("POP", Token::Pop),
1999            ("PEEK", Token::Peek),
2000            ("PURGE", Token::Purge),
2001            ("ACK", Token::Ack),
2002            ("NACK", Token::Nack),
2003            ("PRIORITY", Token::Priority),
2004            ("LPUSH", Token::Ident("LPUSH".into())),
2005            ("RPUSH", Token::Ident("RPUSH".into())),
2006            ("LPOP", Token::Ident("LPOP".into())),
2007            ("RPOP", Token::Ident("RPOP".into())),
2008            ("NEIGHBORHOOD", Token::Neighborhood),
2009            ("SHORTEST_PATH", Token::ShortestPath),
2010            ("SHORTESTPATH", Token::ShortestPath),
2011            ("CENTRALITY", Token::Centrality),
2012            ("COMMUNITY", Token::Community),
2013            ("COMPONENTS", Token::Components),
2014            ("CYCLES", Token::Cycles),
2015            ("TRAVERSE", Token::Traverse),
2016            ("DEPTH", Token::Depth),
2017            ("DIRECTION", Token::Direction),
2018            ("ALGORITHM", Token::Algorithm),
2019            ("STRATEGY", Token::Strategy),
2020            ("MAX_ITERATIONS", Token::MaxIterations),
2021            ("MAXITERATIONS", Token::MaxIterations),
2022            ("MAX_LENGTH", Token::MaxLength),
2023            ("MAXLENGTH", Token::MaxLength),
2024            ("MODE", Token::Mode),
2025            ("CLUSTERING", Token::Clustering),
2026            ("TOPOLOGICAL_SORT", Token::TopologicalSort),
2027            ("TOPOLOGICALSORT", Token::TopologicalSort),
2028            ("PROPERTIES", Token::Properties),
2029            ("TEXT", Token::Text),
2030            ("FUZZY", Token::Fuzzy),
2031            ("MIN_SCORE", Token::MinScore),
2032            ("MINSCORE", Token::MinScore),
2033            ("BEGIN", Token::Begin),
2034            ("COMMIT", Token::Commit),
2035            ("ROLLBACK", Token::Rollback),
2036            ("SAVEPOINT", Token::Savepoint),
2037            ("RELEASE", Token::Release),
2038            ("START", Token::Start),
2039            ("TRANSACTION", Token::Transaction),
2040            ("WORK", Token::Work),
2041            ("VACUUM", Token::Vacuum),
2042            ("ANALYZE", Token::Analyze),
2043            ("SCHEMA", Token::Schema),
2044            ("SEQUENCE", Token::Sequence),
2045            ("INCREMENT", Token::Increment),
2046            ("COPY", Token::Copy),
2047            ("HEADER", Token::Header),
2048            ("DELIMITER", Token::Delimiter),
2049            ("VIEW", Token::View),
2050            ("MATERIALIZED", Token::Materialized),
2051            ("REFRESH", Token::Refresh),
2052            ("PARTITION", Token::Partition),
2053            ("RANGE", Token::Range),
2054            ("LIST", Token::List),
2055            ("HASH", Token::Hash),
2056            ("ATTACH", Token::Attach),
2057            ("DETACH", Token::Detach),
2058            ("OF", Token::Of),
2059            ("POLICY", Token::Policy),
2060            ("ENABLE", Token::Enable),
2061            ("DISABLE", Token::Disable),
2062            ("SECURITY", Token::Security),
2063            ("ROW", Token::Row),
2064            ("LEVEL", Token::Level),
2065            ("FOREIGN", Token::Foreign),
2066            ("SERVER", Token::Server),
2067            ("WRAPPER", Token::Wrapper),
2068            ("OPTIONS", Token::Options),
2069            ("DATA", Token::Data),
2070            ("plain_ident", Token::Ident("plain_ident".into())),
2071        ];
2072
2073        for (input, expected) in cases {
2074            let tokens = tokenize(input);
2075            assert_eq!(tokens, vec![expected, Token::Eof], "{input}");
2076        }
2077    }
2078
2079    #[test]
2080    fn test_display_all_token_variants() {
2081        let cases = [
2082            (Token::Select, "SELECT"),
2083            (Token::From, "FROM"),
2084            (Token::Where, "WHERE"),
2085            (Token::And, "AND"),
2086            (Token::Or, "OR"),
2087            (Token::Not, "NOT"),
2088            (Token::Match, "MATCH"),
2089            (Token::Return, "RETURN"),
2090            (Token::Join, "JOIN"),
2091            (Token::Graph, "GRAPH"),
2092            (Token::Path, "PATH"),
2093            (Token::To, "TO"),
2094            (Token::Via, "VIA"),
2095            (Token::On, "ON"),
2096            (Token::As, "AS"),
2097            (Token::Is, "IS"),
2098            (Token::Null, "NULL"),
2099            (Token::Between, "BETWEEN"),
2100            (Token::Like, "LIKE"),
2101            (Token::In, "IN"),
2102            (Token::Order, "ORDER"),
2103            (Token::By, "BY"),
2104            (Token::Asc, "ASC"),
2105            (Token::Desc, "DESC"),
2106            (Token::Nulls, "NULLS"),
2107            (Token::First, "FIRST"),
2108            (Token::Last, "LAST"),
2109            (Token::Limit, "LIMIT"),
2110            (Token::Offset, "OFFSET"),
2111            (Token::Inner, "INNER"),
2112            (Token::Left, "LEFT"),
2113            (Token::Right, "RIGHT"),
2114            (Token::Outer, "OUTER"),
2115            (Token::Full, "FULL"),
2116            (Token::Cross, "CROSS"),
2117            (Token::Starts, "STARTS"),
2118            (Token::Ends, "ENDS"),
2119            (Token::With, "WITH"),
2120            (Token::Contains, "CONTAINS"),
2121            (Token::True, "TRUE"),
2122            (Token::False, "FALSE"),
2123            (Token::Enrich, "ENRICH"),
2124            (Token::Group, "GROUP"),
2125            (Token::Count, "COUNT"),
2126            (Token::Sum, "SUM"),
2127            (Token::Avg, "AVG"),
2128            (Token::Min, "MIN"),
2129            (Token::Max, "MAX"),
2130            (Token::Distinct, "DISTINCT"),
2131            (Token::Vector, "VECTOR"),
2132            (Token::Search, "SEARCH"),
2133            (Token::Similar, "SIMILAR"),
2134            (Token::Collection, "COLLECTION"),
2135            (Token::Metric, "METRIC"),
2136            (Token::Threshold, "THRESHOLD"),
2137            (Token::K, "K"),
2138            (Token::Hybrid, "HYBRID"),
2139            (Token::Fusion, "FUSION"),
2140            (Token::Rerank, "RERANK"),
2141            (Token::Rrf, "RRF"),
2142            (Token::Intersection, "INTERSECTION"),
2143            (Token::Union, "UNION"),
2144            (Token::Recursive, "RECURSIVE"),
2145            (Token::All, "ALL"),
2146            (Token::Weight, "WEIGHT"),
2147            (Token::L2, "L2"),
2148            (Token::Cosine, "COSINE"),
2149            (Token::InnerProduct, "INNER_PRODUCT"),
2150            (Token::Include, "INCLUDE"),
2151            (Token::Metadata, "METADATA"),
2152            (Token::Vectors, "VECTORS"),
2153            (Token::Explain, "EXPLAIN"),
2154            (Token::For, "FOR"),
2155            (Token::Format, "FORMAT"),
2156            (Token::Json, "JSON"),
2157            (Token::Insert, "INSERT"),
2158            (Token::Into, "INTO"),
2159            (Token::Values, "VALUES"),
2160            (Token::Update, "UPDATE"),
2161            (Token::Set, "SET"),
2162            (Token::Delete, "DELETE"),
2163            (Token::Truncate, "TRUNCATE"),
2164            (Token::Create, "CREATE"),
2165            (Token::Table, "TABLE"),
2166            (Token::Drop, "DROP"),
2167            (Token::Alter, "ALTER"),
2168            (Token::Add, "ADD"),
2169            (Token::Column, "COLUMN"),
2170            (Token::Primary, "PRIMARY"),
2171            (Token::Key, "KEY"),
2172            (Token::Default, "DEFAULT"),
2173            (Token::Compress, "COMPRESS"),
2174            (Token::Index, "INDEX"),
2175            (Token::Unique, "UNIQUE"),
2176            (Token::If, "IF"),
2177            (Token::Exists, "EXISTS"),
2178            (Token::Returning, "RETURNING"),
2179            (Token::Cascade, "CASCADE"),
2180            (Token::Rename, "RENAME"),
2181            (Token::Using, "USING"),
2182            (Token::Node, "NODE"),
2183            (Token::Edge, "EDGE"),
2184            (Token::Document, "DOCUMENT"),
2185            (Token::Kv, "KV"),
2186            (Token::Timeseries, "TIMESERIES"),
2187            (Token::Retention, "RETENTION"),
2188            (Token::Queue, "QUEUE"),
2189            (Token::Tree, "TREE"),
2190            (Token::Push, "PUSH"),
2191            (Token::Pop, "POP"),
2192            (Token::Peek, "PEEK"),
2193            (Token::Purge, "PURGE"),
2194            (Token::Ack, "ACK"),
2195            (Token::Nack, "NACK"),
2196            (Token::Priority, "PRIORITY"),
2197            (Token::Neighborhood, "NEIGHBORHOOD"),
2198            (Token::ShortestPath, "SHORTEST_PATH"),
2199            (Token::Centrality, "CENTRALITY"),
2200            (Token::Community, "COMMUNITY"),
2201            (Token::Components, "COMPONENTS"),
2202            (Token::Cycles, "CYCLES"),
2203            (Token::Traverse, "TRAVERSE"),
2204            (Token::Depth, "DEPTH"),
2205            (Token::Direction, "DIRECTION"),
2206            (Token::Algorithm, "ALGORITHM"),
2207            (Token::Strategy, "STRATEGY"),
2208            (Token::MaxIterations, "MAX_ITERATIONS"),
2209            (Token::MaxLength, "MAX_LENGTH"),
2210            (Token::Mode, "MODE"),
2211            (Token::Clustering, "CLUSTERING"),
2212            (Token::TopologicalSort, "TOPOLOGICAL_SORT"),
2213            (Token::Properties, "PROPERTIES"),
2214            (Token::Text, "TEXT"),
2215            (Token::Fuzzy, "FUZZY"),
2216            (Token::MinScore, "MIN_SCORE"),
2217            (Token::Begin, "BEGIN"),
2218            (Token::Commit, "COMMIT"),
2219            (Token::Rollback, "ROLLBACK"),
2220            (Token::Savepoint, "SAVEPOINT"),
2221            (Token::Release, "RELEASE"),
2222            (Token::Start, "START"),
2223            (Token::Transaction, "TRANSACTION"),
2224            (Token::Work, "WORK"),
2225            (Token::Vacuum, "VACUUM"),
2226            (Token::Analyze, "ANALYZE"),
2227            (Token::Schema, "SCHEMA"),
2228            (Token::Sequence, "SEQUENCE"),
2229            (Token::Increment, "INCREMENT"),
2230            (Token::Copy, "COPY"),
2231            (Token::Header, "HEADER"),
2232            (Token::Delimiter, "DELIMITER"),
2233            (Token::View, "VIEW"),
2234            (Token::Materialized, "MATERIALIZED"),
2235            (Token::Refresh, "REFRESH"),
2236            (Token::Partition, "PARTITION"),
2237            (Token::Range, "RANGE"),
2238            (Token::List, "LIST"),
2239            (Token::Hash, "HASH"),
2240            (Token::Attach, "ATTACH"),
2241            (Token::Detach, "DETACH"),
2242            (Token::Of, "OF"),
2243            (Token::Policy, "POLICY"),
2244            (Token::Enable, "ENABLE"),
2245            (Token::Disable, "DISABLE"),
2246            (Token::Security, "SECURITY"),
2247            (Token::Row, "ROW"),
2248            (Token::Level, "LEVEL"),
2249            (Token::Foreign, "FOREIGN"),
2250            (Token::Server, "SERVER"),
2251            (Token::Wrapper, "WRAPPER"),
2252            (Token::Options, "OPTIONS"),
2253            (Token::Data, "DATA"),
2254            (Token::String("x".into()), "'x'"),
2255            (Token::Integer(7), "7"),
2256            (Token::Float(1.5), "1.5"),
2257            (Token::JsonLiteral(r#"{"x":1}"#.into()), r#"{"x":1}"#),
2258            (Token::Ident("id".into()), "id"),
2259            (Token::Eq, "="),
2260            (Token::Ne, "<>"),
2261            (Token::Lt, "<"),
2262            (Token::Le, "<="),
2263            (Token::Gt, ">"),
2264            (Token::Ge, ">="),
2265            (Token::Plus, "+"),
2266            (Token::Minus, "-"),
2267            (Token::Star, "*"),
2268            (Token::Slash, "/"),
2269            (Token::Percent, "%"),
2270            (Token::LParen, "("),
2271            (Token::RParen, ")"),
2272            (Token::LBracket, "["),
2273            (Token::RBracket, "]"),
2274            (Token::LBrace, "{"),
2275            (Token::RBrace, "}"),
2276            (Token::Comma, ","),
2277            (Token::Dot, "."),
2278            (Token::Colon, ":"),
2279            (Token::Semi, ";"),
2280            (Token::Dollar, "$"),
2281            (Token::FatArrow, "=>"),
2282            (Token::Arrow, "->"),
2283            (Token::ArrowLeft, "<-"),
2284            (Token::Dash, "-"),
2285            (Token::DotDot, ".."),
2286            (Token::Pipe, "|"),
2287            (Token::DoublePipe, "||"),
2288            (Token::Eof, "EOF"),
2289        ];
2290
2291        for (token, expected) in cases {
2292            assert_eq!(token.to_string(), expected);
2293        }
2294    }
2295
2296    #[test]
2297    fn fat_arrow_lexes_distinctly_from_eq() {
2298        // `=>` is the named-argument arrow; a bare `=` stays equality.
2299        assert_eq!(
2300            tokenize("resolution => 0.5"),
2301            vec![
2302                Token::Ident("resolution".into()),
2303                Token::FatArrow,
2304                Token::Float(0.5),
2305                Token::Eof,
2306            ]
2307        );
2308        assert_eq!(
2309            tokenize("x = 1"),
2310            vec![
2311                Token::Ident("x".into()),
2312                Token::Eq,
2313                Token::Integer(1),
2314                Token::Eof,
2315            ]
2316        );
2317    }
2318
2319    #[test]
2320    fn test_string_escape_and_error_matrix() {
2321        let tokens = tokenize(
2322            r#"'line\nrow' 'carriage\rreturn' 'tab\tstop' 'slash\\' 'quote\'' "dq\"" 'raw\z'"#,
2323        );
2324        assert_eq!(
2325            tokens,
2326            vec![
2327                Token::String("line\nrow".into()),
2328                Token::String("carriage\rreturn".into()),
2329                Token::String("tab\tstop".into()),
2330                Token::String("slash\\".into()),
2331                Token::String("quote'".into()),
2332                Token::String("dq\"".into()),
2333                Token::String(r"raw\z".into()),
2334                Token::Eof
2335            ]
2336        );
2337
2338        let mut lexer = Lexer::new("'unterminated");
2339        assert!(lexer
2340            .next_token()
2341            .unwrap_err()
2342            .message
2343            .contains("Unterminated string"));
2344
2345        let mut lexer = Lexer::new(r"'bad\");
2346        assert!(lexer
2347            .next_token()
2348            .unwrap_err()
2349            .message
2350            .contains("Unterminated string"));
2351    }
2352
2353    #[test]
2354    fn test_operator_comment_peek_limit_and_tokenize_paths() {
2355        let tokens = tokenize("!= % ; $ || | 123.abc 1..2 1e+2 <- -> /* block */ SELECT");
2356        assert_eq!(
2357            tokens,
2358            vec![
2359                Token::Ne,
2360                Token::Percent,
2361                Token::Semi,
2362                Token::Dollar,
2363                Token::DoublePipe,
2364                Token::Pipe,
2365                Token::Integer(123),
2366                Token::Dot,
2367                Token::Ident("abc".into()),
2368                Token::Integer(1),
2369                Token::DotDot,
2370                Token::Integer(2),
2371                Token::Float(1e2),
2372                Token::ArrowLeft,
2373                Token::Arrow,
2374                Token::Select,
2375                Token::Eof,
2376            ]
2377        );
2378
2379        let mut lexer = Lexer::new("SELECT FROM");
2380        assert_eq!(lexer.peek_token().unwrap().token, Token::Select);
2381        assert_eq!(lexer.next_token().unwrap().token, Token::Select);
2382        assert_eq!(lexer.next_token().unwrap().token, Token::From);
2383
2384        let mut lexer = Lexer::new("!");
2385        assert!(lexer
2386            .next_token()
2387            .unwrap_err()
2388            .message
2389            .contains("Expected '=' after '!'"));
2390
2391        let limits = crate::limits::ParserLimits {
2392            max_identifier_chars: 3,
2393            ..crate::limits::ParserLimits::default()
2394        };
2395        let mut lexer = Lexer::with_limits("abcd", limits);
2396        assert_eq!(lexer.max_identifier_chars(), 3);
2397        let err = lexer.next_token().unwrap_err();
2398        assert!(matches!(
2399            err.limit_hit,
2400            Some(LexerLimitHit::IdentifierTooLong { value: 3, .. })
2401        ));
2402    }
2403}