spg_sql/
lexer.rs

1//! Lexer for the PG-dialect subset that SPG accepts.
2//!
3//! v0.2 token stream is value-only — no source spans yet. Errors do report
4//! the byte offset where the offending construct started. Identifiers are
5//! ASCII case-folded to lower-case (matches PG when un-quoted). Quoted
6//! identifiers (`"..."`) preserve case; `""` is an embedded quote.
7//! String literals (`'...'`) follow PG single-quote convention with `''`
//! as the embedded quote. Dollar-quoted strings (`$$ … $$` and
//! `$tag$ … $tag$`) are lexed verbatim into a single `Token::String`.
//! E-strings are accepted but not interpreted — that lands in a later
//! milestone.
10
11use alloc::string::{String, ToString};
12use alloc::vec::Vec;
13use core::fmt;
14
/// One lexical token produced by [`tokenize`].
///
/// Keyword, operator and punctuation variants carry no payload; identifier,
/// literal and placeholder variants carry their decoded value. The stream
/// returned by [`tokenize`] always ends with [`Token::Eof`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    As,
    Null,
    True,
    False,
    And,
    Or,
    Not,
    Create,
    Table,
    Insert,
    Into,
    Values,
    Index,
    On,
    Begin,
    Commit,
    Rollback,
    Order,
    By,
    Limit,

    // Identifiers
    /// Unquoted identifier, ASCII case-folded to lower-case.
    Ident(String),
    /// Quoted identifier: original case preserved, `""` → `"`.
    QuotedIdent(String),
    /// v7.14.0 — MySQL session / user variable reference
    /// (`@VAR` / `@@VAR`). The wrapped string is the verbatim
    /// source form (including the `@` / `@@` prefix). Used by
    /// mysqldump preamble (`SET @OLD_FOREIGN_KEY_CHECKS =
    /// @@FOREIGN_KEY_CHECKS, …`); SPG accepts the token and
    /// the SET parser treats the assignment as a no-op apart
    /// from any second LHS that targets a real session
    /// parameter (e.g. `FOREIGN_KEY_CHECKS=0`).
    SessionVar(String),

    // Literals
    /// Integer literal.
    Integer(i64),
    /// Floating-point literal.
    Float(f64),
    /// String literal. Also produced for dollar-quoted bodies
    /// (`$$ … $$` / `$tag$ … $tag$`), which are stored verbatim.
    String(String),

    // Operators
    Plus,
    Minus,
    Star,
    Slash,
    Eq,
    /// Inequality; lexed from both `<>` and `!=`.
    NotEq,
    Lt,
    LtEq,
    Gt,
    GtEq,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR strict contained-in
    /// `<<`. LHS is strictly inside RHS (no equality).
    InetContainedBy,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR contained-in-or-equal
    /// `<<=`. LHS network ⊆ RHS network.
    InetContainedByEq,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR strict contains `>>`.
    /// LHS strictly contains RHS.
    InetContains,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR contains-or-equal `>>=`.
    /// LHS network ⊇ RHS network.
    InetContainsEq,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR network overlap `&&`.
    /// Either side contains any address of the other.
    InetOverlap,

    // Punctuation
    LParen,
    RParen,
    LBracket,
    RBracket,
    Comma,
    Semicolon,
    Dot,
    /// v7.17.0 Phase 2.6 — standalone `@` punctuation. Emitted when
    /// `@` is followed by neither `>`, `@`, nor a session-variable
    /// name byte (ASCII letter, digit, `_`, `.` or `$`), i.e. the
    /// `@VAR` / `@@VAR` SessionVar path doesn't match. Lets the
    /// parser stitch the MySQL `'user'@'host'` DEFINER form back
    /// together as String + At + String. Pre-2.6 this same shape
    /// surfaced as a `LexErrorKind::UnknownChar('@')` and broke
    /// every mysqldump CREATE VIEW with a DEFINER clause at lex
    /// time.
    At,
    /// v4.14 `->` — JSON object/array element access, returns json.
    JsonGet,
    /// v4.14 `->>` — same access, returns text.
    JsonGetText,
    /// v6.4.5 `#>` — JSON path walk, returns json. Path is the
    /// right-hand TEXT with PG `{a,b,0}` syntax.
    JsonGetPath,
    /// v6.4.5 `#>>` — same walk, returns text.
    JsonGetPathText,
    /// v6.4.5 `@>` — JSON containment. `j @> sub` returns true if
    /// every key/value in `sub` is present in `j` with structural
    /// containment for objects + arrays.
    JsonContains,
    /// v7.12.2 `@@` — tsvector / tsquery match. Either ordering
    /// (`vec @@ q` or `q @@ vec`) parses; engine eval normalises
    /// before matching.
    TsMatch,
    /// pgvector L2 distance operator `<->`. Lexed as one token so the
    /// parser can give it its own precedence rung.
    L2Distance,
    /// pgvector inner-product operator `<#>` (returns negative dot product
    /// so smaller still means more similar — same semantics as pgvector).
    InnerProduct,
    /// pgvector cosine distance operator `<=>`.
    CosineDistance,
    /// PG-style cast `expr::type` — single token because we want it to bind
    /// at postfix precedence.
    DoubleColon,
    /// v7.12.4 — PL/pgSQL assignment operator `:=`.
    /// Outside PL/pgSQL bodies this token has no SQL-side meaning.
    ColonEq,
    /// v7.12.4 — bare `:` separator. Used inside `tsvector` external-form
    /// literals (`'cat:1 dog:2'::tsvector`) and as the fallback path for
    /// the PL/pgSQL assignment lexer.
    Colon,
    /// Standard SQL string concatenation `||`.
    Concat,
    // Further keywords (appended after the operator set was introduced).
    /// `IS` keyword — postfix `IS NULL` / `IS NOT NULL` predicates.
    Is,
    Between,
    In,
    Like,
    Group,
    Distinct,
    Union,
    All,
    Join,
    Inner,
    Left,
    Cross,
    Outer,
    Default,
    Savepoint,
    Release,
    To,
    Having,
    Show,
    Extract,
    Offset,
    Asc,
    Desc,
    /// `INTERVAL` — followed by a string literal carrying the span text
    /// (e.g. `INTERVAL '1 day 2 hours'`).
    Interval,
    /// v6.1.1 — `$N` parameter placeholder for the extended query
    /// protocol. The number N is 1-based per PostgreSQL convention.
    /// `$0` is not valid, nor is any N above `u16::MAX`; the lexer
    /// rejects both.
    Placeholder(u16),

    /// v6.1.2 — `DROP` keyword. Used by `DROP PUBLICATION <name>`.
    /// Reserved for future `DROP TABLE` / `DROP INDEX` / `DROP USER`
    /// surface that currently goes through SHOW-shaped admin SQL.
    Drop,
    /// v6.1.2 — `FOR` keyword (publication scope).
    For,
    /// v6.1.2 — `TABLES` plural keyword (`FOR ALL TABLES`,
    /// `FOR ALL TABLES EXCEPT …`). The existing `TABLE` keyword
    /// stays a separate token so `CREATE TABLE`'s single-table
    /// form keeps lexing as today.
    Tables,
    /// v6.1.3 (reserved at v6.1.2 to keep the AST shape stable) —
    /// `EXCEPT` keyword for `FOR ALL TABLES EXCEPT t1, t2`.
    Except,
    /// v6.1.2 — `PUBLICATION` keyword.
    Publication,
    /// v6.1.4 (reserved at v6.1.2) — `SUBSCRIPTION` keyword.
    Subscription,
    /// v6.1.4 — `CONNECTION` keyword (for
    /// `CREATE SUBSCRIPTION … CONNECTION '<conn_str>' …`).
    Connection,

    /// End-of-input marker; always the last token of a successful lex.
    Eof,
}
197
/// The category of a lexing failure; paired with a byte offset in
/// [`LexError`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A character that cannot start (or complete) any token. The payload
    /// is the offending character.
    UnknownChar(char),
    /// A string literal that was never closed. `tokenize` raises this for
    /// dollar-quoted strings with no closing tag.
    /// NOTE(review): presumably also raised for `'...'` literals by the
    /// quoted-string helper — confirm against `lex_quoted`.
    UnterminatedString,
    /// A quoted identifier that was never closed.
    /// NOTE(review): not raised in `tokenize` itself; presumably produced
    /// by `lex_quoted` — confirm.
    UnterminatedQuotedIdent,
    /// A `/* ... */` comment with no closing `*/`.
    UnterminatedBlockComment,
    /// A malformed number. `tokenize` also uses it for `$N` placeholders
    /// whose N is zero or exceeds `u16::MAX`; the payload is then the
    /// offending source text.
    BadNumber(String),
}
206
/// A lexing failure: what went wrong plus where it started.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// What kind of construct failed to lex.
    pub kind: LexErrorKind,
    /// Byte offset into the input where the offending construct started.
    pub pos: usize,
}
212
213impl fmt::Display for LexError {
214    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
215        match &self.kind {
216            LexErrorKind::UnknownChar(c) => write!(f, "unknown char {c:?} at byte {}", self.pos),
217            LexErrorKind::UnterminatedString => {
218                write!(f, "unterminated string literal at byte {}", self.pos)
219            }
220            LexErrorKind::UnterminatedQuotedIdent => {
221                write!(f, "unterminated quoted identifier at byte {}", self.pos)
222            }
223            LexErrorKind::UnterminatedBlockComment => {
224                write!(f, "unterminated /* */ comment at byte {}", self.pos)
225            }
226            LexErrorKind::BadNumber(s) => {
227                write!(f, "invalid number literal {s:?} at byte {}", self.pos)
228            }
229        }
230    }
231}
232
233/// Tokenize `input` into a `Vec<Token>` ending in `Token::Eof`.
234#[allow(clippy::too_many_lines)] // big match — splitting would obscure the dispatch table
235pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
236    let bytes = input.as_bytes();
237    let mut i = 0usize;
238    let mut out = Vec::new();
239
240    while i < bytes.len() {
241        let b = bytes[i];
242        match b {
243            b' ' | b'\t' | b'\n' | b'\r' => {
244                i += 1;
245            }
246            b'-' if peek_eq(bytes, i + 1, b'-') => {
247                i += 2;
248                while i < bytes.len() && bytes[i] != b'\n' {
249                    i += 1;
250                }
251            }
252            b'/' if peek_eq(bytes, i + 1, b'*') => {
253                let start = i;
254                // v7.14.0 — MySQL versioned conditional comment
255                // `/*!NNNNN <body> */`. The body is real SQL that
256                // MySQL/MariaDB executes when the runtime version
257                // matches the 5-digit code; PG strips the whole
258                // thing as a block comment. SPG sides with MySQL
259                // semantics for dump compatibility: skip the
260                // `/*!NNNNN ` prefix and continue lexing the body
261                // as ordinary tokens. The closing `*/` is later
262                // matched + skipped by the symmetric arm below.
263                if peek_eq(bytes, i + 2, b'!') {
264                    let mut j = i + 3;
265                    // skip the optional 5-digit version code +
266                    // following single whitespace
267                    while j < bytes.len() && bytes[j].is_ascii_digit() {
268                        j += 1;
269                    }
270                    if j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') {
271                        j += 1;
272                    }
273                    i = j;
274                    continue;
275                }
276                i += 2;
277                let mut closed = false;
278                while i + 1 < bytes.len() {
279                    if bytes[i] == b'*' && bytes[i + 1] == b'/' {
280                        i += 2;
281                        closed = true;
282                        break;
283                    }
284                    i += 1;
285                }
286                if !closed {
287                    return Err(LexError {
288                        kind: LexErrorKind::UnterminatedBlockComment,
289                        pos: start,
290                    });
291                }
292            }
293            // v7.14.0 — bare `*/` (closing of the v7.14 MySQL
294            // versioned-comment opener that didn't consume the
295            // closer). We treat it as an inline comment terminator
296            // and skip 2 bytes.
297            b'*' if peek_eq(bytes, i + 1, b'/') => {
298                i += 2;
299            }
300            b'\'' => {
301                let (tok, consumed) = lex_quoted(input, i, b'\'', false)?;
302                out.push(tok);
303                i += consumed;
304            }
305            b'"' => {
306                let (tok, consumed) = lex_quoted(input, i, b'"', true)?;
307                out.push(tok);
308                i += consumed;
309            }
310            // MySQL-flavoured backtick-quoted identifier. Same semantics
311            // as the standard `"..."` form, including embedded "``" as
312            // a literal backtick.
313            b'`' => {
314                let (tok, consumed) = lex_quoted(input, i, b'`', true)?;
315                out.push(tok);
316                i += consumed;
317            }
318            b if b.is_ascii_alphabetic() || b == b'_' => {
319                let start = i;
320                i += 1;
321                while i < bytes.len() {
322                    let c = bytes[i];
323                    if c.is_ascii_alphanumeric() || c == b'_' {
324                        i += 1;
325                    } else {
326                        break;
327                    }
328                }
329                let raw = &input[start..i];
330                // v3.0.5: try the keyword table case-insensitively
331                // without allocating; only the ident fall-through
332                // pays for a lowercase String.
333                out.push(keyword_or_ident_raw(raw));
334            }
335            b if b.is_ascii_digit() => {
336                let (tok, consumed) =
337                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
338                out.push(tok);
339                i += consumed;
340            }
341            b'.' if peek_pred(bytes, i + 1, u8::is_ascii_digit) => {
342                let (tok, consumed) =
343                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
344                out.push(tok);
345                i += consumed;
346            }
347            b'+' => single(&mut out, Token::Plus, &mut i),
348            b'-' => {
349                // v4.14: `->>` and `->` for JSON path access. `->>`
350                // must be tried before `->` (longest match).
351                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
352                    out.push(Token::JsonGetText);
353                    i += 3;
354                } else if peek_eq(bytes, i + 1, b'>') {
355                    out.push(Token::JsonGet);
356                    i += 2;
357                } else {
358                    single(&mut out, Token::Minus, &mut i);
359                }
360            }
361            // v6.4.5: `#>>` and `#>` JSON path walk.
362            b'#' => {
363                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
364                    out.push(Token::JsonGetPathText);
365                    i += 3;
366                } else if peek_eq(bytes, i + 1, b'>') {
367                    out.push(Token::JsonGetPath);
368                    i += 2;
369                } else {
370                    return Err(LexError {
371                        kind: LexErrorKind::UnknownChar('#'),
372                        pos: i,
373                    });
374                }
375            }
376            // v6.4.5: `@>` JSON containment.
377            // v7.12.2: `@@` tsvector / tsquery match.
378            // v7.14.0: `@@NAME` MySQL session variable ref +
379            //          `@NAME` user variable ref. mysqldump preamble
380            //          uses both heavily (`SET @OLD_FOREIGN_KEY_CHECKS
381            //          = @@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0`).
382            //          We lex both as a single SessionVar token so
383            //          the parser can accept and ignore them.
384            b'@' => {
385                if peek_eq(bytes, i + 1, b'>') {
386                    out.push(Token::JsonContains);
387                    i += 2;
388                } else if peek_eq(bytes, i + 1, b'@')
389                    && !is_session_var_ident_start(bytes.get(i + 2).copied())
390                {
391                    // `@@` not followed by an ident-start byte is
392                    // the tsquery `@@` operator.
393                    out.push(Token::TsMatch);
394                    i += 2;
395                } else {
396                    // `@VAR` / `@@VAR` — MySQL user / session
397                    // variable reference. Consume the ident-shaped
398                    // tail and emit as Token::SessionVar so the
399                    // SET parser can accept-and-ignore.
400                    let prefix_end = if peek_eq(bytes, i + 1, b'@') {
401                        i + 2
402                    } else {
403                        i + 1
404                    };
405                    let mut end = prefix_end;
406                    while end < bytes.len() && is_session_var_ident_continue(bytes[end]) {
407                        end += 1;
408                    }
409                    if end == prefix_end {
410                        // v7.17.0 Phase 2.6 — `@` not followed by an
411                        // ident-shaped tail. mysqldump's DEFINER
412                        // form `'user'@'host'` lands here (next
413                        // byte is `'`). Emit as Token::At so the
414                        // parser can stitch the surrounding String
415                        // tokens. Single `@@` already short-circuits
416                        // to Token::TsMatch above, so this only
417                        // fires for a true lone `@`.
418                        out.push(Token::At);
419                        i = prefix_end;
420                        continue;
421                    }
422                    out.push(Token::SessionVar(input[i..end].to_string()));
423                    i = end;
424                }
425            }
426            b'*' => single(&mut out, Token::Star, &mut i),
427            b'/' => single(&mut out, Token::Slash, &mut i),
428            b'(' => single(&mut out, Token::LParen, &mut i),
429            b')' => single(&mut out, Token::RParen, &mut i),
430            b'[' => single(&mut out, Token::LBracket, &mut i),
431            b']' => single(&mut out, Token::RBracket, &mut i),
432            b',' => single(&mut out, Token::Comma, &mut i),
433            b';' => single(&mut out, Token::Semicolon, &mut i),
434            b'.' => single(&mut out, Token::Dot, &mut i),
435            b'=' => single(&mut out, Token::Eq, &mut i),
436            b'<' => {
437                if peek_eq(bytes, i + 1, b'=') && peek_eq(bytes, i + 2, b'>') {
438                    out.push(Token::CosineDistance);
439                    i += 3;
440                } else if peek_eq(bytes, i + 1, b'#') && peek_eq(bytes, i + 2, b'>') {
441                    out.push(Token::InnerProduct);
442                    i += 3;
443                } else if peek_eq(bytes, i + 1, b'-') && peek_eq(bytes, i + 2, b'>') {
444                    out.push(Token::L2Distance);
445                    i += 3;
446                } else if peek_eq(bytes, i + 1, b'<') && peek_eq(bytes, i + 2, b'=') {
447                    // v7.17.0 Phase 3.P0-47 — PG INET `<<=` contained-or-equal.
448                    out.push(Token::InetContainedByEq);
449                    i += 3;
450                } else if peek_eq(bytes, i + 1, b'<') {
451                    // v7.17.0 Phase 3.P0-47 — PG INET `<<` strict contained.
452                    out.push(Token::InetContainedBy);
453                    i += 2;
454                } else if peek_eq(bytes, i + 1, b'=') {
455                    out.push(Token::LtEq);
456                    i += 2;
457                } else if peek_eq(bytes, i + 1, b'>') {
458                    out.push(Token::NotEq);
459                    i += 2;
460                } else {
461                    out.push(Token::Lt);
462                    i += 1;
463                }
464            }
465            b':' if peek_eq(bytes, i + 1, b':') => {
466                out.push(Token::DoubleColon);
467                i += 2;
468            }
469            b':' if peek_eq(bytes, i + 1, b'=') => {
470                // v7.12.4 — PL/pgSQL assignment operator `:=`.
471                out.push(Token::ColonEq);
472                i += 2;
473            }
474            b':' => {
475                // v7.12.4 — bare `:`. Used inside `tsvector` external-form
476                // literals which the cast parser consumes in-token, and as a
477                // separator the PL/pgSQL assignment lexer can recover from.
478                out.push(Token::Colon);
479                i += 1;
480            }
481            b'|' if peek_eq(bytes, i + 1, b'|') => {
482                out.push(Token::Concat);
483                i += 2;
484            }
485            b'>' => {
486                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'=') {
487                    // v7.17.0 Phase 3.P0-47 — PG INET `>>=` contains-or-equal.
488                    out.push(Token::InetContainsEq);
489                    i += 3;
490                } else if peek_eq(bytes, i + 1, b'>') {
491                    // v7.17.0 Phase 3.P0-47 — PG INET `>>` strict contains.
492                    out.push(Token::InetContains);
493                    i += 2;
494                } else if peek_eq(bytes, i + 1, b'=') {
495                    out.push(Token::GtEq);
496                    i += 2;
497                } else {
498                    out.push(Token::Gt);
499                    i += 1;
500                }
501            }
502            b'&' if peek_eq(bytes, i + 1, b'&') => {
503                // v7.17.0 Phase 3.P0-47 — PG INET network overlap `&&`.
504                out.push(Token::InetOverlap);
505                i += 2;
506            }
507            b'!' if peek_eq(bytes, i + 1, b'=') => {
508                out.push(Token::NotEq);
509                i += 2;
510            }
511            // v7.9.27 — PG dollar-quoted string `$$ … $$` (or
512            // `$tag$ … $tag$`). Used in `DO $$ … $$ LANGUAGE
513            // plpgsql;` blocks that pg_dump emits for idempotent
514            // migrations. SPG has no PL/pgSQL, so the lexer
515            // consumes the entire string as a single Token::String
516            // and the parser treats the surrounding `DO …;` as a
517            // no-op. mailrs follow-up H1.
518            b'$' if i + 1 < bytes.len() && bytes[i + 1] == b'$' => {
519                // Empty tag form: `$$ … $$`.
520                let end = find_dollar_tag_end(bytes, i + 2, b"$$");
521                let body = match end {
522                    Some(e) => &input[i + 2..e],
523                    None => {
524                        return Err(LexError {
525                            kind: LexErrorKind::UnterminatedString,
526                            pos: i,
527                        });
528                    }
529                };
530                out.push(Token::String(body.to_string()));
531                i = end.unwrap() + 2;
532            }
533            b'$' if i + 1 < bytes.len()
534                && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_') =>
535            {
536                // Tagged form: `$foo$ … $foo$`. Scan the tag
537                // ident, find the closing copy.
538                let mut j = i + 1;
539                while j < bytes.len() && (bytes[j].is_ascii_alphanumeric() || bytes[j] == b'_') {
540                    j += 1;
541                }
542                if j >= bytes.len() || bytes[j] != b'$' {
543                    // Not a dollar-quoted string — fall through
544                    // to the generic-unknown-char path.
545                    let ch = input[i..].chars().next().unwrap_or('?');
546                    return Err(LexError {
547                        kind: LexErrorKind::UnknownChar(ch),
548                        pos: i,
549                    });
550                }
551                let close: alloc::vec::Vec<u8> = bytes[i..=j].to_vec();
552                let end = find_dollar_tag_end(bytes, j + 1, &close);
553                let body = match end {
554                    Some(e) => &input[j + 1..e],
555                    None => {
556                        return Err(LexError {
557                            kind: LexErrorKind::UnterminatedString,
558                            pos: i,
559                        });
560                    }
561                };
562                out.push(Token::String(body.to_string()));
563                i = end.unwrap() + close.len();
564            }
565            // v6.1.1: `$N` parameter placeholder for the extended
566            // query protocol. PG numbers them 1..=N; we reject $0
567            // and a bare `$` not followed by a digit.
568            b'$' if i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() => {
569                let mut j = i + 1;
570                let mut n: u32 = 0;
571                while j < bytes.len() && bytes[j].is_ascii_digit() {
572                    n = n
573                        .saturating_mul(10)
574                        .saturating_add(u32::from(bytes[j] - b'0'));
575                    j += 1;
576                }
577                if n == 0 || n > u32::from(u16::MAX) {
578                    return Err(LexError {
579                        kind: LexErrorKind::BadNumber(input[i..j].to_string()),
580                        pos: i,
581                    });
582                }
583                #[allow(clippy::cast_possible_truncation)]
584                out.push(Token::Placeholder(n as u16));
585                i = j;
586            }
587            _ => {
588                let ch = input[i..].chars().next().unwrap_or('?');
589                return Err(LexError {
590                    kind: LexErrorKind::UnknownChar(ch),
591                    pos: i,
592                });
593            }
594        }
595    }
596    out.push(Token::Eof);
597    Ok(out)
598}
599
/// Returns `true` when `bytes` has a byte at index `i` and that byte equals
/// `target`; an out-of-range index yields `false`.
fn peek_eq(bytes: &[u8], i: usize, target: u8) -> bool {
    matches!(bytes.get(i), Some(&found) if found == target)
}
603
/// v7.14.0 — tests whether `b` can be the first byte of a MySQL
/// session/user variable name (the byte right after `@` or `@@`).
/// Accepts an ASCII letter or underscore; `None` (end of input) and
/// every other byte are rejected.
fn is_session_var_ident_start(b: Option<u8>) -> bool {
    match b {
        Some(byte) => byte == b'_' || byte.is_ascii_alphabetic(),
        None => false,
    }
}
611
/// Tests whether `b` may appear in the tail of a `@VAR` / `@@VAR`
/// name: ASCII letters, digits, underscore, dot (MySQL scope
/// qualifiers such as `@@global.sql_mode`) and `$` (accepted by some
/// MySQL versions).
fn is_session_var_ident_continue(b: u8) -> bool {
    matches!(
        b,
        b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'.' | b'$'
    )
}
619
/// v7.9.27 — find the start index of the next occurrence of `tag`
/// (e.g. `b"$$"` or `b"$foo$"`) in `bytes`, searching from `from`.
///
/// Returns the absolute index (relative to the start of `bytes`) of the
/// first match, or `None` when `tag` is empty, `from` lies past the end
/// of `bytes`, or no match exists.
fn find_dollar_tag_end(bytes: &[u8], from: usize, tag: &[u8]) -> Option<usize> {
    // `windows(0)` panics, and an empty tag can never close a string.
    if tag.is_empty() {
        return None;
    }
    bytes
        .get(from..)? // `None` when `from > bytes.len()`
        .windows(tag.len())
        .position(|window| window == tag)
        .map(|offset| from + offset)
}
635
/// Applies `pred` to the byte at index `i` of `bytes`; returns `false`
/// when `i` is out of range.
fn peek_pred<F: Fn(&u8) -> bool>(bytes: &[u8], i: usize, pred: F) -> bool {
    match bytes.get(i) {
        Some(byte) => pred(byte),
        None => false,
    }
}
639
640fn single(out: &mut Vec<Token>, tok: Token, i: &mut usize) {
641    out.push(tok);
642    *i += 1;
643}
644
645/// Length-first ASCII-CI keyword lookup. Avoids allocating a
646/// lowercase `String` when the input matches a keyword; only the ident
647/// fall-through path pays for the lowercase copy.
648///
649/// Grouped by length so the outer `match` becomes a small jump table.
650/// Within a length bucket every keyword has either a unique first
651/// byte (cheap dispatch) or a small set of disambiguating
652/// trailing-byte comparisons. All comparisons are ASCII-CI (XOR
653/// 0x20 on each byte before the compare).
654fn keyword_or_ident_raw(raw: &str) -> Token {
655    let b = raw.as_bytes();
656    let tok = match b.len() {
657        2 => kw_len2(b),
658        3 => kw_len3(b),
659        4 => kw_len4(b),
660        5 => kw_len5(b),
661        6 => kw_len6(b),
662        7 => kw_len7(b),
663        8 => kw_len8(b),
664        9 => kw_len9(b),
665        10 => kw_len10(b),
666        11 => kw_len11(b),
667        12 => kw_len12(b),
668        _ => None,
669    };
670    match tok {
671        Some(t) => t,
672        // Ident fall-through: this is the only path that allocates.
673        None => Token::Ident(raw.to_ascii_lowercase()),
674    }
675}
676
/// ASCII case-insensitive equality of `input` against an
/// already-lowercase literal `lower`.
///
/// Each byte of `input` is folded with `to_ascii_lowercase` and compared
/// with the corresponding byte of `lower`; `lower` itself is not folded,
/// so callers must pass a lowercase literal. Slices of different length
/// never match. Non-letter bytes are compared exactly.
#[inline]
fn eq_ci(input: &[u8], lower: &[u8]) -> bool {
    input.len() == lower.len()
        && input
            .iter()
            .zip(lower)
            .all(|(byte, want)| byte.to_ascii_lowercase() == *want)
}
694
695#[inline]
696fn kw_len2(b: &[u8]) -> Option<Token> {
697    // 7 keywords: as, by, in, is, on, or, to
698    if eq_ci(b, b"as") {
699        return Some(Token::As);
700    }
701    if eq_ci(b, b"by") {
702        return Some(Token::By);
703    }
704    if eq_ci(b, b"in") {
705        return Some(Token::In);
706    }
707    if eq_ci(b, b"is") {
708        return Some(Token::Is);
709    }
710    if eq_ci(b, b"on") {
711        return Some(Token::On);
712    }
713    if eq_ci(b, b"or") {
714        return Some(Token::Or);
715    }
716    if eq_ci(b, b"to") {
717        return Some(Token::To);
718    }
719    None
720}
721
722#[inline]
723fn kw_len3(b: &[u8]) -> Option<Token> {
724    // 5 keywords: all, and, asc, not, for
725    if eq_ci(b, b"for") {
726        return Some(Token::For);
727    }
728    if eq_ci(b, b"all") {
729        return Some(Token::All);
730    }
731    if eq_ci(b, b"and") {
732        return Some(Token::And);
733    }
734    if eq_ci(b, b"asc") {
735        return Some(Token::Asc);
736    }
737    if eq_ci(b, b"not") {
738        return Some(Token::Not);
739    }
740    None
741}
742
743#[inline]
744fn kw_len4(b: &[u8]) -> Option<Token> {
745    // 10 keywords: from, null, true, into, like, join, left, show, desc, drop
746    if eq_ci(b, b"from") {
747        return Some(Token::From);
748    }
749    if eq_ci(b, b"drop") {
750        return Some(Token::Drop);
751    }
752    if eq_ci(b, b"null") {
753        return Some(Token::Null);
754    }
755    if eq_ci(b, b"true") {
756        return Some(Token::True);
757    }
758    if eq_ci(b, b"into") {
759        return Some(Token::Into);
760    }
761    if eq_ci(b, b"like") {
762        return Some(Token::Like);
763    }
764    if eq_ci(b, b"join") {
765        return Some(Token::Join);
766    }
767    if eq_ci(b, b"left") {
768        return Some(Token::Left);
769    }
770    if eq_ci(b, b"show") {
771        return Some(Token::Show);
772    }
773    if eq_ci(b, b"desc") {
774        return Some(Token::Desc);
775    }
776    None
777}
778
779#[inline]
780fn kw_len5(b: &[u8]) -> Option<Token> {
781    // 12 keywords: false, where, table, index, begin, order, limit,
782    // group, union, inner, cross, outer
783    if eq_ci(b, b"false") {
784        return Some(Token::False);
785    }
786    if eq_ci(b, b"where") {
787        return Some(Token::Where);
788    }
789    if eq_ci(b, b"table") {
790        return Some(Token::Table);
791    }
792    if eq_ci(b, b"index") {
793        return Some(Token::Index);
794    }
795    if eq_ci(b, b"begin") {
796        return Some(Token::Begin);
797    }
798    if eq_ci(b, b"order") {
799        return Some(Token::Order);
800    }
801    if eq_ci(b, b"limit") {
802        return Some(Token::Limit);
803    }
804    if eq_ci(b, b"group") {
805        return Some(Token::Group);
806    }
807    if eq_ci(b, b"union") {
808        return Some(Token::Union);
809    }
810    if eq_ci(b, b"inner") {
811        return Some(Token::Inner);
812    }
813    if eq_ci(b, b"cross") {
814        return Some(Token::Cross);
815    }
816    if eq_ci(b, b"outer") {
817        return Some(Token::Outer);
818    }
819    None
820}
821
822#[inline]
823fn kw_len6(b: &[u8]) -> Option<Token> {
824    // 9 keywords: select, create, insert, values, commit, having, offset, tables, except
825    if eq_ci(b, b"select") {
826        return Some(Token::Select);
827    }
828    if eq_ci(b, b"tables") {
829        return Some(Token::Tables);
830    }
831    if eq_ci(b, b"except") {
832        return Some(Token::Except);
833    }
834    if eq_ci(b, b"create") {
835        return Some(Token::Create);
836    }
837    if eq_ci(b, b"insert") {
838        return Some(Token::Insert);
839    }
840    if eq_ci(b, b"values") {
841        return Some(Token::Values);
842    }
843    if eq_ci(b, b"commit") {
844        return Some(Token::Commit);
845    }
846    if eq_ci(b, b"having") {
847        return Some(Token::Having);
848    }
849    if eq_ci(b, b"offset") {
850        return Some(Token::Offset);
851    }
852    None
853}
854
855#[inline]
856fn kw_len7(b: &[u8]) -> Option<Token> {
857    // 4 keywords: between, default, release, extract
858    if eq_ci(b, b"between") {
859        return Some(Token::Between);
860    }
861    if eq_ci(b, b"default") {
862        return Some(Token::Default);
863    }
864    if eq_ci(b, b"release") {
865        return Some(Token::Release);
866    }
867    if eq_ci(b, b"extract") {
868        return Some(Token::Extract);
869    }
870    None
871}
872
873#[inline]
874fn kw_len8(b: &[u8]) -> Option<Token> {
875    // 3 keywords: rollback, distinct, interval
876    if eq_ci(b, b"rollback") {
877        return Some(Token::Rollback);
878    }
879    if eq_ci(b, b"distinct") {
880        return Some(Token::Distinct);
881    }
882    if eq_ci(b, b"interval") {
883        return Some(Token::Interval);
884    }
885    None
886}
887
888#[inline]
889fn kw_len9(b: &[u8]) -> Option<Token> {
890    // 1 keyword: savepoint
891    if eq_ci(b, b"savepoint") {
892        return Some(Token::Savepoint);
893    }
894    None
895}
896
897#[inline]
898fn kw_len10(b: &[u8]) -> Option<Token> {
899    // 1 keyword: connection
900    if eq_ci(b, b"connection") {
901        return Some(Token::Connection);
902    }
903    None
904}
905
906#[inline]
907fn kw_len11(b: &[u8]) -> Option<Token> {
908    // 1 keyword: publication
909    if eq_ci(b, b"publication") {
910        return Some(Token::Publication);
911    }
912    None
913}
914
915#[inline]
916fn kw_len12(b: &[u8]) -> Option<Token> {
917    // 1 keyword: subscription
918    if eq_ci(b, b"subscription") {
919        return Some(Token::Subscription);
920    }
921    None
922}
923
924/// Lex a `'...'` string literal or `"..."` quoted identifier. The opening
925/// quote sits at `input[start]`; `quote` is its byte value. `is_ident` selects
926/// the resulting token shape.
927///
928/// PG-style doubling escapes the quote: `''` inside `'...'` is a literal `'`,
929/// same for `""` inside `"..."`.
930fn lex_quoted(
931    input: &str,
932    start: usize,
933    quote: u8,
934    is_ident: bool,
935) -> Result<(Token, usize), LexError> {
936    let bytes = input.as_bytes();
937    let mut i = start + 1;
938    let mut s = String::new();
939    loop {
940        if i >= bytes.len() {
941            return Err(LexError {
942                kind: if is_ident {
943                    LexErrorKind::UnterminatedQuotedIdent
944                } else {
945                    LexErrorKind::UnterminatedString
946                },
947                pos: start,
948            });
949        }
950        if bytes[i] == quote {
951            if peek_eq(bytes, i + 1, quote) {
952                s.push(quote as char);
953                i += 2;
954            } else {
955                i += 1;
956                break;
957            }
958        } else {
959            let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
960            s.push(ch);
961            i += ch.len_utf8();
962        }
963    }
964    let tok = if is_ident {
965        Token::QuotedIdent(s)
966    } else {
967        Token::String(s)
968    };
969    Ok((tok, i - start))
970}
971
972fn lex_number(s: &str) -> Result<(Token, usize), LexErrorKind> {
973    let bytes = s.as_bytes();
974    let mut i = 0usize;
975    let mut is_float = false;
976
977    while i < bytes.len() && bytes[i].is_ascii_digit() {
978        i += 1;
979    }
980    if i < bytes.len() && bytes[i] == b'.' {
981        is_float = true;
982        i += 1;
983        while i < bytes.len() && bytes[i].is_ascii_digit() {
984            i += 1;
985        }
986    }
987    if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
988        is_float = true;
989        i += 1;
990        if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
991            i += 1;
992        }
993        let exp_start = i;
994        while i < bytes.len() && bytes[i].is_ascii_digit() {
995            i += 1;
996        }
997        if exp_start == i {
998            return Err(LexErrorKind::BadNumber(s[..i].to_string()));
999        }
1000    }
1001
1002    let lit = &s[..i];
1003    if is_float {
1004        lit.parse::<f64>()
1005            .map(|v| (Token::Float(v), i))
1006            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1007    } else {
1008        lit.parse::<i64>()
1009            .map(|v| (Token::Integer(v), i))
1010            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1011    }
1012}
1013
// Unit tests for the lexer. Each test feeds a small SQL fragment to
// `tokenize` and checks either the resulting token stream or the reported
// error.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Tokenizes `s`, panicking with "lex ok" if the lexer returns an error.
    fn lex(s: &str) -> Vec<Token> {
        tokenize(s).expect("lex ok")
    }

    #[test]
    fn empty_yields_only_eof() {
        assert_eq!(lex(""), vec![Token::Eof]);
    }

    #[test]
    fn whitespace_only_yields_only_eof() {
        assert_eq!(lex("   \t\n  "), vec![Token::Eof]);
    }

    #[test]
    fn keywords_are_case_insensitive() {
        assert_eq!(
            lex("SELECT select Select"),
            vec![Token::Select, Token::Select, Token::Select, Token::Eof]
        );
    }

    #[test]
    fn identifiers_lowercase_ascii() {
        // Unquoted identifiers come back folded to lower case.
        assert_eq!(
            lex("hello WORLD _x x1"),
            vec![
                Token::Ident("hello".into()),
                Token::Ident("world".into()),
                Token::Ident("_x".into()),
                Token::Ident("x1".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn quoted_identifier_keeps_case_and_handles_embedded_quote() {
        // `""` inside a quoted identifier stands for one literal `"`.
        assert_eq!(
            lex(r#""User Name" "a""b""#),
            vec![
                Token::QuotedIdent("User Name".into()),
                Token::QuotedIdent("a\"b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn integer_and_float_literals() {
        // Covers plain integers, a fraction, a leading-dot fraction, and
        // exponents with and without a sign.
        assert_eq!(
            lex("0 42 1.5 .5 1e10 2.5e-3"),
            vec![
                Token::Integer(0),
                Token::Integer(42),
                Token::Float(1.5),
                Token::Float(0.5),
                Token::Float(1e10),
                Token::Float(2.5e-3),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn negative_number_is_minus_then_integer() {
        // PG follows this: unary minus is a separate token, parser folds it.
        assert_eq!(
            lex("-42"),
            vec![Token::Minus, Token::Integer(42), Token::Eof]
        );
    }

    #[test]
    fn string_literal_doubled_quote_escape() {
        // `''` inside a string literal stands for one literal `'`.
        assert_eq!(
            lex("'hello' 'it''s'"),
            vec![
                Token::String("hello".into()),
                Token::String("it's".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn all_comparison_and_arithmetic_operators() {
        // Both `<>` and `!=` map to the same `NotEq` token.
        assert_eq!(
            lex("= <> != < <= > >= + - * /"),
            vec![
                Token::Eq,
                Token::NotEq,
                Token::NotEq,
                Token::Lt,
                Token::LtEq,
                Token::Gt,
                Token::GtEq,
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn punctuation() {
        assert_eq!(
            lex("( ) , ; ."),
            vec![
                Token::LParen,
                Token::RParen,
                Token::Comma,
                Token::Semicolon,
                Token::Dot,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn line_comment_skipped() {
        // Everything from `--` to the end of the line produces no tokens.
        assert_eq!(
            lex("SELECT -- trailing junk\nFROM"),
            vec![Token::Select, Token::From, Token::Eof]
        );
    }

    #[test]
    fn block_comment_skipped() {
        assert_eq!(
            lex("SELECT /* skipped */ 1"),
            vec![Token::Select, Token::Integer(1), Token::Eof]
        );
    }

    #[test]
    fn unterminated_string_errors() {
        // The error position is the offset of the opening quote.
        let err = tokenize("'oops").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
        assert_eq!(err.pos, 0);
    }

    #[test]
    fn unterminated_block_comment_errors() {
        let err = tokenize("/* never closed").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedBlockComment));
    }

    #[test]
    fn unknown_char_errors() {
        // v7.17.0 Phase 2.6 — `@` standalone now lexes as
        // Token::At (mysqldump `'user'@'host'` DEFINER stitching),
        // so it can no longer serve as the unknown-char probe.
        // The input used here is the ASCII BEL control byte
        // (`\x07`), which must be rejected as an unknown character.
        let err = tokenize("\x07").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnknownChar(_)));
    }

    #[test]
    fn at_alone_lexes_as_punctuation() {
        // v7.17.0 Phase 2.6 — the `'user'@'host'` MySQL DEFINER
        // form needs `@` to lex as a standalone token.
        assert_eq!(
            lex("'u'@'h'"),
            vec![
                Token::String("u".into()),
                Token::At,
                Token::String("h".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn dot_in_qualified_column() {
        // `t.col` is three tokens; the dot is not absorbed into either
        // identifier.
        assert_eq!(
            lex("t.col"),
            vec![
                Token::Ident("t".into()),
                Token::Dot,
                Token::Ident("col".into()),
                Token::Eof,
            ]
        );
    }

    // --- v0.11 brackets + distance op + vector keyword --------------------

    #[test]
    fn brackets_are_distinct_tokens() {
        assert_eq!(
            lex("[ ]"),
            vec![Token::LBracket, Token::RBracket, Token::Eof]
        );
    }

    #[test]
    fn l2_distance_is_three_char_token() {
        assert_eq!(
            lex("a <-> b"),
            vec![
                Token::Ident("a".into()),
                Token::L2Distance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        // Bare `<-` should NOT match L2Distance.
        assert_eq!(
            lex("a <- b"),
            vec![
                Token::Ident("a".into()),
                Token::Lt,
                Token::Minus,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn order_by_limit_are_keywords() {
        assert_eq!(
            lex("ORDER BY LIMIT"),
            vec![Token::Order, Token::By, Token::Limit, Token::Eof]
        );
    }

    // --- v1.2: pgvector distance ops + PG cast --------------------------

    #[test]
    fn inner_product_operator_3char() {
        assert_eq!(
            lex("a <#> b"),
            vec![
                Token::Ident("a".into()),
                Token::InnerProduct,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn cosine_distance_operator_3char() {
        assert_eq!(
            lex("a <=> b"),
            vec![
                Token::Ident("a".into()),
                Token::CosineDistance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        // Make sure the shorter `<=` still lexes as LtEq now that `<=>`
        // exists. Only `<=` is re-checked here; `<>` is covered by
        // `all_comparison_and_arithmetic_operators` and `<->` by
        // `l2_distance_is_three_char_token`.
        assert_eq!(
            lex("a <= b"),
            vec![
                Token::Ident("a".into()),
                Token::LtEq,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn double_colon_cast_token() {
        // The type name after `::` is an ordinary identifier, so it is
        // lower-cased like any other.
        assert_eq!(
            lex("x::INT"),
            vec![
                Token::Ident("x".into()),
                Token::DoubleColon,
                Token::Ident("int".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn lone_single_colon_lexes_as_colon_token() {
        // v7.12.4 — single `:` is now a token (PL/pgSQL surface
        // + tsvector external-form literal both need it). The
        // pre-v7.12.4 "single colon = unknown char" behaviour
        // was incidental.
        let toks = tokenize(":x").expect("colon now lexes");
        assert_eq!(toks[0], Token::Colon);
    }

    #[test]
    fn colon_eq_lexes_as_assignment() {
        // v7.12.4 — PL/pgSQL assignment operator.
        let toks = tokenize("x := 1").expect("colon-eq lexes");
        // Expected order: Ident("x"), ColonEq, then the integer literal.
        // Only the operator at index 1 is asserted here.
        assert!(matches!(toks[1], Token::ColonEq));
    }
}
spg_sql/lexer.rs

spg_sql/
lexer.rs