spg_sql/
lexer.rs

1//! Lexer for the PG-dialect subset that SPG accepts.
2//!
3//! v0.2 token stream is value-only — no source spans yet. Errors do report
4//! the byte offset where the offending construct started. Identifiers are
5//! ASCII case-folded to lower-case (matches PG when un-quoted). Quoted
6//! identifiers (`"..."`) preserve case; `""` is an embedded quote.
//! String literals (`'...'`) follow PG single-quote convention with `''`
//! as the embedded quote. `E'...'` escape strings are decoded, and
//! dollar-quoted strings (`$$...$$` / `$tag$...$tag$`) are consumed whole;
//! both are emitted as ordinary string tokens.
10
11use alloc::string::{String, ToString};
12use alloc::vec::Vec;
13use core::fmt;
14
/// A single lexical token produced by [`tokenize`] / [`tokenize_with`].
///
/// Keyword, operator and punctuation variants carry no payload;
/// identifier, literal and placeholder variants carry their text or value.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    As,
    Null,
    True,
    False,
    And,
    Or,
    Not,
    Create,
    Table,
    Insert,
    Into,
    Values,
    Index,
    On,
    Begin,
    Commit,
    Rollback,
    Order,
    By,
    Limit,

    // Identifiers
    Ident(String),       // ASCII case-folded
    QuotedIdent(String), // original case, "" → "
    /// v7.14.0 — MySQL session / user variable reference
    /// (`@VAR` / `@@VAR`). The wrapped string is the verbatim
    /// source form (including the `@` / `@@` prefix). Used by
    /// mysqldump preamble (`SET @OLD_FOREIGN_KEY_CHECKS =
    /// @@FOREIGN_KEY_CHECKS, …`); SPG accepts the token and
    /// the SET parser treats the assignment as a no-op apart
    /// from any second LHS that targets a real session
    /// parameter (e.g. `FOREIGN_KEY_CHECKS=0`).
    SessionVar(String),

    // Literals
    Integer(i64),
    Float(f64),
    /// String literal body. Dollar-quoted strings (`$$ … $$` /
    /// `$tag$ … $tag$`) are also emitted as this variant, carrying the
    /// text between the opening and closing tags.
    String(String),

    // Operators
    Plus,
    Minus,
    Star,
    Slash,
    Eq,
    /// Inequality — lexed from either `<>` or `!=`.
    NotEq,
    Lt,
    LtEq,
    Gt,
    GtEq,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR strict contained-in
    /// `<<`. LHS is strictly inside RHS (no equality).
    InetContainedBy,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR contained-in-or-equal
    /// `<<=`. LHS network ⊆ RHS network.
    InetContainedByEq,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR strict contains `>>`.
    /// LHS strictly contains RHS.
    InetContains,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR contains-or-equal `>>=`.
    /// LHS network ⊇ RHS network.
    InetContainsEq,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR network overlap `&&`.
    /// Either side contains any address of the other.
    InetOverlap,

    // Punctuation
    LParen,
    RParen,
    LBracket,
    RBracket,
    Comma,
    Semicolon,
    Dot,
    /// v7.17.0 Phase 2.6 — standalone `@` punctuation. Emitted when
    /// `@` is followed by neither `>` (→ `JsonContains`), another `@`
    /// (→ `TsMatch` / `SessionVar`), nor a variable-name byte (the
    /// `@VAR` SessionVar path). Lets the
    /// parser stitch the MySQL `'user'@'host'` DEFINER form back
    /// together as String + At + String. Pre-2.6 this same shape
    /// surfaced as a `LexErrorKind::UnknownChar('@')` and broke
    /// every mysqldump CREATE VIEW with a DEFINER clause at lex
    /// time.
    At,

    // Multi-byte operators (JSON, full-text, pgvector, casts, bitwise)
    /// v4.14 `->` — JSON object/array element access, returns json.
    JsonGet,
    /// v4.14 `->>` — same access, returns text.
    JsonGetText,
    /// v6.4.5 `#>` — JSON path walk, returns json. Path is the
    /// right-hand TEXT with PG `{a,b,0}` syntax.
    JsonGetPath,
    /// v6.4.5 `#>>` — same walk, returns text.
    JsonGetPathText,
    /// v6.4.5 `@>` — JSON containment. `j @> sub` returns true if
    /// every key/value in `sub` is present in `j` with structural
    /// containment for objects + arrays.
    JsonContains,
    /// v7.12.2 `@@` — tsvector / tsquery match. Either ordering
    /// (`vec @@ q` or `q @@ vec`) parses; engine eval normalises
    /// before matching.
    TsMatch,
    /// pgvector L2 distance operator `<->`. Lexed as one token so the
    /// parser can give it its own precedence rung.
    L2Distance,
    /// pgvector inner-product operator `<#>` (returns negative dot product
    /// so smaller still means more similar — same semantics as pgvector).
    InnerProduct,
    /// pgvector cosine distance operator `<=>`.
    CosineDistance,
    /// PG-style cast `expr::type` — single token because we want it to bind
    /// at postfix precedence.
    DoubleColon,
    /// v7.12.4 — PL/pgSQL assignment operator `:=`.
    /// Outside PL/pgSQL bodies this token has no SQL-side meaning.
    ColonEq,
    /// v7.12.4 — bare `:` separator. Used inside `tsvector` external-form
    /// literals (`'cat:1 dog:2'::tsvector`) and as the fallback path for
    /// the PL/pgSQL assignment lexer.
    Colon,
    /// Standard SQL string concatenation `||`.
    Concat,
    /// Bitwise OR `|` (single pipe — `||` lexes as Concat first).
    Pipe,
    /// Bitwise AND `&` (single amp — `&&` lexes as InetOverlap first).
    Amp,
    /// Bitwise NOT `~` (prefix).
    Tilde,

    // More keywords
    /// `IS` keyword — postfix `IS NULL` / `IS NOT NULL` predicates.
    Is,
    Between,
    In,
    Like,
    Group,
    Distinct,
    Union,
    All,
    Join,
    Inner,
    Left,
    Cross,
    Outer,
    Default,
    Savepoint,
    Release,
    To,
    Having,
    Show,
    Extract,
    Offset,
    Asc,
    Desc,
    /// `INTERVAL` — followed by a string literal carrying the span text
    /// (e.g. `INTERVAL '1 day 2 hours'`).
    Interval,
    /// v6.1.1 — `$N` parameter placeholder for the extended query
    /// protocol. The number N is 1-based per PostgreSQL convention.
    /// `$0` is not valid; the lexer rejects it, along with any N
    /// larger than `u16::MAX`.
    Placeholder(u16),

    /// v6.1.2 — `DROP` keyword. Used by `DROP PUBLICATION <name>`.
    /// Reserved for future `DROP TABLE` / `DROP INDEX` / `DROP USER`
    /// surface that currently goes through SHOW-shaped admin SQL.
    Drop,
    /// v6.1.2 — `FOR` keyword (publication scope).
    For,
    /// v6.1.2 — `TABLES` plural keyword (`FOR ALL TABLES`,
    /// `FOR ALL TABLES EXCEPT …`). The existing `TABLE` keyword
    /// stays a separate token so `CREATE TABLE`'s single-table
    /// form keeps lexing as today.
    Tables,
    /// v6.1.3 (reserved at v6.1.2 to keep the AST shape stable) —
    /// `EXCEPT` keyword for `FOR ALL TABLES EXCEPT t1, t2`.
    Except,
    /// v6.1.2 — `PUBLICATION` keyword.
    Publication,
    /// v6.1.4 (reserved at v6.1.2) — `SUBSCRIPTION` keyword.
    Subscription,
    /// v6.1.4 — `CONNECTION` keyword (for
    /// `CREATE SUBSCRIPTION … CONNECTION '<conn_str>' …`).
    Connection,

    /// End-of-input marker; always the last token of a successful
    /// tokenize call.
    Eof,
}
203
/// Category of failure reported by the lexer; paired with a byte offset
/// in [`LexError`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A character that does not start any recognised token. Also used
    /// for a `#` that is not followed by `>` and for a `$tag` opener
    /// that has no second `$`.
    UnknownChar(char),
    /// A string literal with no terminator. `tokenize_with` raises it
    /// for a dollar-quoted string whose closing tag is missing;
    /// presumably the quote-lexing helpers raise it for `'…'` as well
    /// (they are defined elsewhere — confirm there).
    UnterminatedString,
    /// A quoted identifier with no closing quote. NOTE(review): not
    /// raised directly by `tokenize_with`; presumably produced by the
    /// quote-lexing helper — confirm.
    UnterminatedQuotedIdent,
    /// A `/*` block comment with no matching `*/`.
    UnterminatedBlockComment,
    /// A malformed numeric construct; carries the offending text.
    /// `tokenize_with` raises it for `$0` and for `$N` with N above
    /// `u16::MAX`, and forwards whatever kind `lex_number` reports.
    BadNumber(String),
}
212
/// A lexing failure: what went wrong and where.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// The category of the failure.
    pub kind: LexErrorKind,
    /// Byte offset into the input at which the offending construct
    /// starts (e.g. the opening `/*` of an unterminated comment).
    pub pos: usize,
}
218
219impl fmt::Display for LexError {
220    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
221        match &self.kind {
222            LexErrorKind::UnknownChar(c) => write!(f, "unknown char {c:?} at byte {}", self.pos),
223            LexErrorKind::UnterminatedString => {
224                write!(f, "unterminated string literal at byte {}", self.pos)
225            }
226            LexErrorKind::UnterminatedQuotedIdent => {
227                write!(f, "unterminated quoted identifier at byte {}", self.pos)
228            }
229            LexErrorKind::UnterminatedBlockComment => {
230                write!(f, "unterminated /* */ comment at byte {}", self.pos)
231            }
232            LexErrorKind::BadNumber(s) => {
233                write!(f, "invalid number literal {s:?} at byte {}", self.pos)
234            }
235        }
236    }
237}
238
239/// Tokenize `input` into a `Vec<Token>` ending in `Token::Eof`,
240/// with PG string semantics (backslash is a literal byte inside
241/// `'…'`; `''` is the only escape).
242pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
243    tokenize_with(input, false)
244}
245
246/// v7.22 (round-13 T3) — dialect-aware tokenizer entry. With
247/// `backslash_escapes = true`, plain `'…'` strings honour MySQL /
248/// pre-9.1-PG backslash escapes (`\'` `\\` `\n` …, the same decode
249/// the `E'…'` form uses). mysqldump ALWAYS emits `\'`-escaped data
250/// sections, and pg_dump ALWAYS announces PG semantics via
251/// `SET standard_conforming_strings = on` — the engine flips this
252/// flag off/on from those deterministic session signals.
253#[allow(clippy::too_many_lines)] // big match — splitting would obscure the dispatch table
254pub fn tokenize_with(input: &str, backslash_escapes: bool) -> Result<Vec<Token>, LexError> {
255    let bytes = input.as_bytes();
256    let mut i = 0usize;
257    let mut out = Vec::new();
258
259    while i < bytes.len() {
260        let b = bytes[i];
261        match b {
262            b' ' | b'\t' | b'\n' | b'\r' => {
263                i += 1;
264            }
265            b'-' if peek_eq(bytes, i + 1, b'-') => {
266                i += 2;
267                while i < bytes.len() && bytes[i] != b'\n' {
268                    i += 1;
269                }
270            }
271            b'/' if peek_eq(bytes, i + 1, b'*') => {
272                let start = i;
273                // v7.14.0 — MySQL versioned conditional comment
274                // `/*!NNNNN <body> */`. The body is real SQL that
275                // MySQL/MariaDB executes when the runtime version
276                // matches the 5-digit code; PG strips the whole
277                // thing as a block comment. SPG sides with MySQL
278                // semantics for dump compatibility: skip the
279                // `/*!NNNNN ` prefix and continue lexing the body
280                // as ordinary tokens. The closing `*/` is later
281                // matched + skipped by the symmetric arm below.
282                if peek_eq(bytes, i + 2, b'!') {
283                    let mut j = i + 3;
284                    // skip the optional 5-digit version code +
285                    // following single whitespace
286                    while j < bytes.len() && bytes[j].is_ascii_digit() {
287                        j += 1;
288                    }
289                    if j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') {
290                        j += 1;
291                    }
292                    i = j;
293                    continue;
294                }
295                i += 2;
296                let mut closed = false;
297                while i + 1 < bytes.len() {
298                    if bytes[i] == b'*' && bytes[i + 1] == b'/' {
299                        i += 2;
300                        closed = true;
301                        break;
302                    }
303                    i += 1;
304                }
305                if !closed {
306                    return Err(LexError {
307                        kind: LexErrorKind::UnterminatedBlockComment,
308                        pos: start,
309                    });
310                }
311            }
312            // v7.14.0 — bare `*/` (closing of the v7.14 MySQL
313            // versioned-comment opener that didn't consume the
314            // closer). We treat it as an inline comment terminator
315            // and skip 2 bytes.
316            b'*' if peek_eq(bytes, i + 1, b'/') => {
317                i += 2;
318            }
319            b'\'' => {
320                let (tok, consumed) = if backslash_escapes {
321                    // MySQL-dialect session: plain strings decode
322                    // backslash escapes — same machinery as E'…'.
323                    lex_escape_string(input, i)?
324                } else {
325                    lex_quoted(input, i, b'\'', false)?
326                };
327                out.push(tok);
328                i += consumed;
329            }
330            // v7.18 — PG escape-string literal `E'...'` / `e'...'`.
331            // Closes the mailrs D-pre #3 reverse-acceptance gap:
332            // `INSERT INTO oq VALUES (E'\\xdeadbeef'::bytea)` needs
333            // the `E` prefix so `\\` decodes to a single `\`. The
334            // produced Token::String carries the decoded body so
335            // downstream parser / cast paths treat it identically
336            // to a regular string literal.
337            b'E' | b'e' if peek_eq(bytes, i + 1, b'\'') => {
338                let (tok, consumed) = lex_escape_string(input, i + 1)?;
339                out.push(tok);
340                i += 1 + consumed;
341            }
342            b'"' => {
343                let (tok, consumed) = lex_quoted(input, i, b'"', true)?;
344                out.push(tok);
345                i += consumed;
346            }
347            // MySQL-flavoured backtick-quoted identifier. Same semantics
348            // as the standard `"..."` form, including embedded "``" as
349            // a literal backtick.
350            b'`' => {
351                let (tok, consumed) = lex_quoted(input, i, b'`', true)?;
352                out.push(tok);
353                i += consumed;
354            }
355            b if b.is_ascii_alphabetic() || b == b'_' => {
356                let start = i;
357                i += 1;
358                while i < bytes.len() {
359                    let c = bytes[i];
360                    if c.is_ascii_alphanumeric() || c == b'_' {
361                        i += 1;
362                    } else {
363                        break;
364                    }
365                }
366                let raw = &input[start..i];
367                // v3.0.5: try the keyword table case-insensitively
368                // without allocating; only the ident fall-through
369                // pays for a lowercase String.
370                out.push(keyword_or_ident_raw(raw));
371            }
372            b if b.is_ascii_digit() => {
373                let (tok, consumed) =
374                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
375                out.push(tok);
376                i += consumed;
377            }
378            b'.' if peek_pred(bytes, i + 1, u8::is_ascii_digit) => {
379                let (tok, consumed) =
380                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
381                out.push(tok);
382                i += consumed;
383            }
384            b'+' => single(&mut out, Token::Plus, &mut i),
385            b'-' => {
386                // v4.14: `->>` and `->` for JSON path access. `->>`
387                // must be tried before `->` (longest match).
388                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
389                    out.push(Token::JsonGetText);
390                    i += 3;
391                } else if peek_eq(bytes, i + 1, b'>') {
392                    out.push(Token::JsonGet);
393                    i += 2;
394                } else {
395                    single(&mut out, Token::Minus, &mut i);
396                }
397            }
398            // v6.4.5: `#>>` and `#>` JSON path walk.
399            b'#' => {
400                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
401                    out.push(Token::JsonGetPathText);
402                    i += 3;
403                } else if peek_eq(bytes, i + 1, b'>') {
404                    out.push(Token::JsonGetPath);
405                    i += 2;
406                } else {
407                    return Err(LexError {
408                        kind: LexErrorKind::UnknownChar('#'),
409                        pos: i,
410                    });
411                }
412            }
413            // v6.4.5: `@>` JSON containment.
414            // v7.12.2: `@@` tsvector / tsquery match.
415            // v7.14.0: `@@NAME` MySQL session variable ref +
416            //          `@NAME` user variable ref. mysqldump preamble
417            //          uses both heavily (`SET @OLD_FOREIGN_KEY_CHECKS
418            //          = @@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0`).
419            //          We lex both as a single SessionVar token so
420            //          the parser can accept and ignore them.
421            b'@' => {
422                if peek_eq(bytes, i + 1, b'>') {
423                    out.push(Token::JsonContains);
424                    i += 2;
425                } else if peek_eq(bytes, i + 1, b'@')
426                    && !is_session_var_ident_start(bytes.get(i + 2).copied())
427                {
428                    // `@@` not followed by an ident-start byte is
429                    // the tsquery `@@` operator.
430                    out.push(Token::TsMatch);
431                    i += 2;
432                } else {
433                    // `@VAR` / `@@VAR` — MySQL user / session
434                    // variable reference. Consume the ident-shaped
435                    // tail and emit as Token::SessionVar so the
436                    // SET parser can accept-and-ignore.
437                    let prefix_end = if peek_eq(bytes, i + 1, b'@') {
438                        i + 2
439                    } else {
440                        i + 1
441                    };
442                    let mut end = prefix_end;
443                    while end < bytes.len() && is_session_var_ident_continue(bytes[end]) {
444                        end += 1;
445                    }
446                    if end == prefix_end {
447                        // v7.17.0 Phase 2.6 — `@` not followed by an
448                        // ident-shaped tail. mysqldump's DEFINER
449                        // form `'user'@'host'` lands here (next
450                        // byte is `'`). Emit as Token::At so the
451                        // parser can stitch the surrounding String
452                        // tokens. Single `@@` already short-circuits
453                        // to Token::TsMatch above, so this only
454                        // fires for a true lone `@`.
455                        out.push(Token::At);
456                        i = prefix_end;
457                        continue;
458                    }
459                    out.push(Token::SessionVar(input[i..end].to_string()));
460                    i = end;
461                }
462            }
463            b'*' => single(&mut out, Token::Star, &mut i),
464            b'/' => single(&mut out, Token::Slash, &mut i),
465            b'(' => single(&mut out, Token::LParen, &mut i),
466            b')' => single(&mut out, Token::RParen, &mut i),
467            b'[' => single(&mut out, Token::LBracket, &mut i),
468            b']' => single(&mut out, Token::RBracket, &mut i),
469            b',' => single(&mut out, Token::Comma, &mut i),
470            b';' => single(&mut out, Token::Semicolon, &mut i),
471            b'.' => single(&mut out, Token::Dot, &mut i),
472            b'=' => single(&mut out, Token::Eq, &mut i),
473            b'<' => {
474                if peek_eq(bytes, i + 1, b'=') && peek_eq(bytes, i + 2, b'>') {
475                    out.push(Token::CosineDistance);
476                    i += 3;
477                } else if peek_eq(bytes, i + 1, b'#') && peek_eq(bytes, i + 2, b'>') {
478                    out.push(Token::InnerProduct);
479                    i += 3;
480                } else if peek_eq(bytes, i + 1, b'-') && peek_eq(bytes, i + 2, b'>') {
481                    out.push(Token::L2Distance);
482                    i += 3;
483                } else if peek_eq(bytes, i + 1, b'<') && peek_eq(bytes, i + 2, b'=') {
484                    // v7.17.0 Phase 3.P0-47 — PG INET `<<=` contained-or-equal.
485                    out.push(Token::InetContainedByEq);
486                    i += 3;
487                } else if peek_eq(bytes, i + 1, b'<') {
488                    // v7.17.0 Phase 3.P0-47 — PG INET `<<` strict contained.
489                    out.push(Token::InetContainedBy);
490                    i += 2;
491                } else if peek_eq(bytes, i + 1, b'=') {
492                    out.push(Token::LtEq);
493                    i += 2;
494                } else if peek_eq(bytes, i + 1, b'>') {
495                    out.push(Token::NotEq);
496                    i += 2;
497                } else {
498                    out.push(Token::Lt);
499                    i += 1;
500                }
501            }
502            b':' if peek_eq(bytes, i + 1, b':') => {
503                out.push(Token::DoubleColon);
504                i += 2;
505            }
506            b':' if peek_eq(bytes, i + 1, b'=') => {
507                // v7.12.4 — PL/pgSQL assignment operator `:=`.
508                out.push(Token::ColonEq);
509                i += 2;
510            }
511            b':' => {
512                // v7.12.4 — bare `:`. Used inside `tsvector` external-form
513                // literals which the cast parser consumes in-token, and as a
514                // separator the PL/pgSQL assignment lexer can recover from.
515                out.push(Token::Colon);
516                i += 1;
517            }
518            b'|' if peek_eq(bytes, i + 1, b'|') => {
519                out.push(Token::Concat);
520                i += 2;
521            }
522            // Bitwise operators (PG integer ops; mailrs IMAP flag
523            // masks: `flags | $1`, `flags & ~$1`).
524            b'|' => {
525                single(&mut out, Token::Pipe, &mut i);
526            }
527            b'~' => {
528                single(&mut out, Token::Tilde, &mut i);
529            }
530            b'>' => {
531                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'=') {
532                    // v7.17.0 Phase 3.P0-47 — PG INET `>>=` contains-or-equal.
533                    out.push(Token::InetContainsEq);
534                    i += 3;
535                } else if peek_eq(bytes, i + 1, b'>') {
536                    // v7.17.0 Phase 3.P0-47 — PG INET `>>` strict contains.
537                    out.push(Token::InetContains);
538                    i += 2;
539                } else if peek_eq(bytes, i + 1, b'=') {
540                    out.push(Token::GtEq);
541                    i += 2;
542                } else {
543                    out.push(Token::Gt);
544                    i += 1;
545                }
546            }
547            b'&' if peek_eq(bytes, i + 1, b'&') => {
548                // v7.17.0 Phase 3.P0-47 — PG INET network overlap `&&`.
549                out.push(Token::InetOverlap);
550                i += 2;
551            }
552            b'&' => {
553                single(&mut out, Token::Amp, &mut i);
554            }
555            b'!' if peek_eq(bytes, i + 1, b'=') => {
556                out.push(Token::NotEq);
557                i += 2;
558            }
559            // v7.9.27 — PG dollar-quoted string `$$ … $$` (or
560            // `$tag$ … $tag$`). Used in `DO $$ … $$ LANGUAGE
561            // plpgsql;` blocks that pg_dump emits for idempotent
562            // migrations. SPG has no PL/pgSQL, so the lexer
563            // consumes the entire string as a single Token::String
564            // and the parser treats the surrounding `DO …;` as a
565            // no-op. mailrs follow-up H1.
566            b'$' if i + 1 < bytes.len() && bytes[i + 1] == b'$' => {
567                // Empty tag form: `$$ … $$`.
568                let end = find_dollar_tag_end(bytes, i + 2, b"$$");
569                let body = match end {
570                    Some(e) => &input[i + 2..e],
571                    None => {
572                        return Err(LexError {
573                            kind: LexErrorKind::UnterminatedString,
574                            pos: i,
575                        });
576                    }
577                };
578                out.push(Token::String(body.to_string()));
579                i = end.unwrap() + 2;
580            }
581            b'$' if i + 1 < bytes.len()
582                && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_') =>
583            {
584                // Tagged form: `$foo$ … $foo$`. Scan the tag
585                // ident, find the closing copy.
586                let mut j = i + 1;
587                while j < bytes.len() && (bytes[j].is_ascii_alphanumeric() || bytes[j] == b'_') {
588                    j += 1;
589                }
590                if j >= bytes.len() || bytes[j] != b'$' {
591                    // Not a dollar-quoted string — fall through
592                    // to the generic-unknown-char path.
593                    let ch = input[i..].chars().next().unwrap_or('?');
594                    return Err(LexError {
595                        kind: LexErrorKind::UnknownChar(ch),
596                        pos: i,
597                    });
598                }
599                let close: alloc::vec::Vec<u8> = bytes[i..=j].to_vec();
600                let end = find_dollar_tag_end(bytes, j + 1, &close);
601                let body = match end {
602                    Some(e) => &input[j + 1..e],
603                    None => {
604                        return Err(LexError {
605                            kind: LexErrorKind::UnterminatedString,
606                            pos: i,
607                        });
608                    }
609                };
610                out.push(Token::String(body.to_string()));
611                i = end.unwrap() + close.len();
612            }
613            // v6.1.1: `$N` parameter placeholder for the extended
614            // query protocol. PG numbers them 1..=N; we reject $0
615            // and a bare `$` not followed by a digit.
616            b'$' if i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() => {
617                let mut j = i + 1;
618                let mut n: u32 = 0;
619                while j < bytes.len() && bytes[j].is_ascii_digit() {
620                    n = n
621                        .saturating_mul(10)
622                        .saturating_add(u32::from(bytes[j] - b'0'));
623                    j += 1;
624                }
625                if n == 0 || n > u32::from(u16::MAX) {
626                    return Err(LexError {
627                        kind: LexErrorKind::BadNumber(input[i..j].to_string()),
628                        pos: i,
629                    });
630                }
631                #[allow(clippy::cast_possible_truncation)]
632                out.push(Token::Placeholder(n as u16));
633                i = j;
634            }
635            _ => {
636                let ch = input[i..].chars().next().unwrap_or('?');
637                return Err(LexError {
638                    kind: LexErrorKind::UnknownChar(ch),
639                    pos: i,
640                });
641            }
642        }
643    }
644    out.push(Token::Eof);
645    Ok(out)
646}
647
/// Returns `true` when `bytes` has a byte at index `i` and that byte
/// equals `target`. An out-of-range index yields `false` rather than
/// panicking, so callers can look ahead without bounds checks.
fn peek_eq(bytes: &[u8], i: usize, target: u8) -> bool {
    bytes.get(i).is_some_and(|&found| found == target)
}
651
/// v7.14.0 — does `b` qualify as the first byte of a MySQL
/// session/user variable name (the byte right after `@` / `@@`)?
///
/// Accepts an ASCII letter or underscore. `None` (end of input) and
/// every other byte are rejected.
fn is_session_var_ident_start(b: Option<u8>) -> bool {
    match b {
        Some(b'_') => true,
        Some(byte) => byte.is_ascii_alphabetic(),
        None => false,
    }
}
659
/// Is `b` allowed inside the name part of a `@VAR` / `@@VAR` reference?
///
/// Accepts ASCII letters and digits plus `_`, `.` (MySQL scope
/// qualifiers such as `@@global.sql_mode`) and `$` (accepted by some
/// MySQL versions).
fn is_session_var_ident_continue(b: u8) -> bool {
    matches!(b, b'_' | b'.' | b'$') || b.is_ascii_alphanumeric()
}
667
/// v7.9.27 — locate the next occurrence of `tag` (e.g. `b"$$"` or
/// `b"$foo$"`) in `bytes`, searching from index `from` onwards.
///
/// Returns the absolute start index of the match. Returns `None` when
/// `tag` is empty, when `from` lies past the end of `bytes`, or when
/// no occurrence exists.
fn find_dollar_tag_end(bytes: &[u8], from: usize, tag: &[u8]) -> Option<usize> {
    if tag.is_empty() {
        return None;
    }
    // `get` rejects `from > bytes.len()`; `from == bytes.len()` gives
    // an empty haystack, which yields no windows.
    let haystack = bytes.get(from..)?;
    haystack
        .windows(tag.len())
        .position(|window| window == tag)
        .map(|offset| from + offset)
}
683
/// Returns `true` when `bytes` has a byte at index `i` and `pred`
/// accepts it. An out-of-range index yields `false` without calling
/// `pred`.
fn peek_pred<F: Fn(&u8) -> bool>(bytes: &[u8], i: usize, pred: F) -> bool {
    match bytes.get(i) {
        Some(byte) => pred(byte),
        None => false,
    }
}
687
688fn single(out: &mut Vec<Token>, tok: Token, i: &mut usize) {
689    out.push(tok);
690    *i += 1;
691}
692
693/// Length-first ASCII-CI keyword lookup. Avoids allocating a
694/// lowercase `String` when the input matches a keyword; only the ident
695/// fall-through path pays for the lowercase copy.
696///
697/// Grouped by length so the outer `match` becomes a small jump table.
698/// Within a length bucket every keyword has either a unique first
699/// byte (cheap dispatch) or a small set of disambiguating
700/// trailing-byte comparisons. All comparisons are ASCII-CI (XOR
701/// 0x20 on each byte before the compare).
702fn keyword_or_ident_raw(raw: &str) -> Token {
703    let b = raw.as_bytes();
704    let tok = match b.len() {
705        2 => kw_len2(b),
706        3 => kw_len3(b),
707        4 => kw_len4(b),
708        5 => kw_len5(b),
709        6 => kw_len6(b),
710        7 => kw_len7(b),
711        8 => kw_len8(b),
712        9 => kw_len9(b),
713        10 => kw_len10(b),
714        11 => kw_len11(b),
715        12 => kw_len12(b),
716        _ => None,
717    };
718    match tok {
719        Some(t) => t,
720        // Ident fall-through: this is the only path that allocates.
721        None => Token::Ident(raw.to_ascii_lowercase()),
722    }
723}
724
/// ASCII case-insensitive equality of `input` against an already
/// lower-case literal.
///
/// Only `input` is case-folded (via `to_ascii_lowercase`); `lower` is
/// compared verbatim, so it must be spelled in lower-case for a match
/// to be possible. Slices of different lengths never match.
#[inline]
fn eq_ci(input: &[u8], lower: &[u8]) -> bool {
    input.len() == lower.len()
        && input
            .iter()
            .zip(lower)
            .all(|(&got, &want)| got.to_ascii_lowercase() == want)
}
742
743#[inline]
744fn kw_len2(b: &[u8]) -> Option<Token> {
745    // 7 keywords: as, by, in, is, on, or, to
746    if eq_ci(b, b"as") {
747        return Some(Token::As);
748    }
749    if eq_ci(b, b"by") {
750        return Some(Token::By);
751    }
752    if eq_ci(b, b"in") {
753        return Some(Token::In);
754    }
755    if eq_ci(b, b"is") {
756        return Some(Token::Is);
757    }
758    if eq_ci(b, b"on") {
759        return Some(Token::On);
760    }
761    if eq_ci(b, b"or") {
762        return Some(Token::Or);
763    }
764    if eq_ci(b, b"to") {
765        return Some(Token::To);
766    }
767    None
768}
769
770#[inline]
771fn kw_len3(b: &[u8]) -> Option<Token> {
772    // 5 keywords: all, and, asc, not, for
773    if eq_ci(b, b"for") {
774        return Some(Token::For);
775    }
776    if eq_ci(b, b"all") {
777        return Some(Token::All);
778    }
779    if eq_ci(b, b"and") {
780        return Some(Token::And);
781    }
782    if eq_ci(b, b"asc") {
783        return Some(Token::Asc);
784    }
785    if eq_ci(b, b"not") {
786        return Some(Token::Not);
787    }
788    None
789}
790
791#[inline]
792fn kw_len4(b: &[u8]) -> Option<Token> {
793    // 10 keywords: from, null, true, into, like, join, left, show, desc, drop
794    if eq_ci(b, b"from") {
795        return Some(Token::From);
796    }
797    if eq_ci(b, b"drop") {
798        return Some(Token::Drop);
799    }
800    if eq_ci(b, b"null") {
801        return Some(Token::Null);
802    }
803    if eq_ci(b, b"true") {
804        return Some(Token::True);
805    }
806    if eq_ci(b, b"into") {
807        return Some(Token::Into);
808    }
809    if eq_ci(b, b"like") {
810        return Some(Token::Like);
811    }
812    if eq_ci(b, b"join") {
813        return Some(Token::Join);
814    }
815    if eq_ci(b, b"left") {
816        return Some(Token::Left);
817    }
818    if eq_ci(b, b"show") {
819        return Some(Token::Show);
820    }
821    if eq_ci(b, b"desc") {
822        return Some(Token::Desc);
823    }
824    None
825}
826
827#[inline]
828fn kw_len5(b: &[u8]) -> Option<Token> {
829    // 12 keywords: false, where, table, index, begin, order, limit,
830    // group, union, inner, cross, outer
831    if eq_ci(b, b"false") {
832        return Some(Token::False);
833    }
834    if eq_ci(b, b"where") {
835        return Some(Token::Where);
836    }
837    if eq_ci(b, b"table") {
838        return Some(Token::Table);
839    }
840    if eq_ci(b, b"index") {
841        return Some(Token::Index);
842    }
843    if eq_ci(b, b"begin") {
844        return Some(Token::Begin);
845    }
846    if eq_ci(b, b"order") {
847        return Some(Token::Order);
848    }
849    if eq_ci(b, b"limit") {
850        return Some(Token::Limit);
851    }
852    if eq_ci(b, b"group") {
853        return Some(Token::Group);
854    }
855    if eq_ci(b, b"union") {
856        return Some(Token::Union);
857    }
858    if eq_ci(b, b"inner") {
859        return Some(Token::Inner);
860    }
861    if eq_ci(b, b"cross") {
862        return Some(Token::Cross);
863    }
864    if eq_ci(b, b"outer") {
865        return Some(Token::Outer);
866    }
867    None
868}
869
870#[inline]
871fn kw_len6(b: &[u8]) -> Option<Token> {
872    // 9 keywords: select, create, insert, values, commit, having, offset, tables, except
873    if eq_ci(b, b"select") {
874        return Some(Token::Select);
875    }
876    if eq_ci(b, b"tables") {
877        return Some(Token::Tables);
878    }
879    if eq_ci(b, b"except") {
880        return Some(Token::Except);
881    }
882    if eq_ci(b, b"create") {
883        return Some(Token::Create);
884    }
885    if eq_ci(b, b"insert") {
886        return Some(Token::Insert);
887    }
888    if eq_ci(b, b"values") {
889        return Some(Token::Values);
890    }
891    if eq_ci(b, b"commit") {
892        return Some(Token::Commit);
893    }
894    if eq_ci(b, b"having") {
895        return Some(Token::Having);
896    }
897    if eq_ci(b, b"offset") {
898        return Some(Token::Offset);
899    }
900    None
901}
902
903#[inline]
904fn kw_len7(b: &[u8]) -> Option<Token> {
905    // 4 keywords: between, default, release, extract
906    if eq_ci(b, b"between") {
907        return Some(Token::Between);
908    }
909    if eq_ci(b, b"default") {
910        return Some(Token::Default);
911    }
912    if eq_ci(b, b"release") {
913        return Some(Token::Release);
914    }
915    if eq_ci(b, b"extract") {
916        return Some(Token::Extract);
917    }
918    None
919}
920
921#[inline]
922fn kw_len8(b: &[u8]) -> Option<Token> {
923    // 3 keywords: rollback, distinct, interval
924    if eq_ci(b, b"rollback") {
925        return Some(Token::Rollback);
926    }
927    if eq_ci(b, b"distinct") {
928        return Some(Token::Distinct);
929    }
930    if eq_ci(b, b"interval") {
931        return Some(Token::Interval);
932    }
933    None
934}
935
936#[inline]
937fn kw_len9(b: &[u8]) -> Option<Token> {
938    // 1 keyword: savepoint
939    if eq_ci(b, b"savepoint") {
940        return Some(Token::Savepoint);
941    }
942    None
943}
944
945#[inline]
946fn kw_len10(b: &[u8]) -> Option<Token> {
947    // 1 keyword: connection
948    if eq_ci(b, b"connection") {
949        return Some(Token::Connection);
950    }
951    None
952}
953
954#[inline]
955fn kw_len11(b: &[u8]) -> Option<Token> {
956    // 1 keyword: publication
957    if eq_ci(b, b"publication") {
958        return Some(Token::Publication);
959    }
960    None
961}
962
963#[inline]
964fn kw_len12(b: &[u8]) -> Option<Token> {
965    // 1 keyword: subscription
966    if eq_ci(b, b"subscription") {
967        return Some(Token::Subscription);
968    }
969    None
970}
971
972/// Lex a `'...'` string literal or `"..."` quoted identifier. The opening
973/// quote sits at `input[start]`; `quote` is its byte value. `is_ident` selects
974/// the resulting token shape.
975///
976/// PG-style doubling escapes the quote: `''` inside `'...'` is a literal `'`,
977/// same for `""` inside `"..."`.
978fn lex_quoted(
979    input: &str,
980    start: usize,
981    quote: u8,
982    is_ident: bool,
983) -> Result<(Token, usize), LexError> {
984    let bytes = input.as_bytes();
985    let mut i = start + 1;
986    let mut s = String::new();
987    loop {
988        if i >= bytes.len() {
989            return Err(LexError {
990                kind: if is_ident {
991                    LexErrorKind::UnterminatedQuotedIdent
992                } else {
993                    LexErrorKind::UnterminatedString
994                },
995                pos: start,
996            });
997        }
998        if bytes[i] == quote {
999            if peek_eq(bytes, i + 1, quote) {
1000                s.push(quote as char);
1001                i += 2;
1002            } else {
1003                i += 1;
1004                break;
1005            }
1006        } else {
1007            let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
1008            s.push(ch);
1009            i += ch.len_utf8();
1010        }
1011    }
1012    let tok = if is_ident {
1013        Token::QuotedIdent(s)
1014    } else {
1015        Token::String(s)
1016    };
1017    Ok((tok, i - start))
1018}
1019
1020/// v7.18 — Lex a PG escape-string literal `E'...'`. `start` points
1021/// at the opening single quote (the `E` was matched by the caller
1022/// and is NOT part of `start`'s offset semantics — the consumed
1023/// count returned excludes the `E`, which the caller adds).
1024///
1025/// Recognised escape sequences:
1026///   \\ \' \" — literal backslash / quote
1027///   \n \r \t \b \f — standard whitespace controls
1028///   \0 — NUL
1029///   \xHH — single hex byte (1–2 hex digits)
1030///   \NNN — octal byte (1–3 octal digits)
1031/// Any other `\X` decodes to the literal byte `X` (PG warns; SPG
1032/// follows the lenient behaviour pg_dump output relies on).
1033///
1034/// Doubled `''` is still a literal `'` (same as the non-E form).
1035fn lex_escape_string(input: &str, start: usize) -> Result<(Token, usize), LexError> {
1036    let bytes = input.as_bytes();
1037    debug_assert_eq!(bytes[start], b'\'');
1038    let mut i = start + 1;
1039    let mut s = String::new();
1040    loop {
1041        if i >= bytes.len() {
1042            return Err(LexError {
1043                kind: LexErrorKind::UnterminatedString,
1044                pos: start,
1045            });
1046        }
1047        let b = bytes[i];
1048        if b == b'\'' {
1049            if peek_eq(bytes, i + 1, b'\'') {
1050                s.push('\'');
1051                i += 2;
1052                continue;
1053            }
1054            i += 1;
1055            break;
1056        }
1057        if b == b'\\' && i + 1 < bytes.len() {
1058            let n = bytes[i + 1];
1059            match n {
1060                b'\\' => {
1061                    s.push('\\');
1062                    i += 2;
1063                }
1064                b'\'' => {
1065                    s.push('\'');
1066                    i += 2;
1067                }
1068                b'"' => {
1069                    s.push('"');
1070                    i += 2;
1071                }
1072                b'n' => {
1073                    s.push('\n');
1074                    i += 2;
1075                }
1076                b'r' => {
1077                    s.push('\r');
1078                    i += 2;
1079                }
1080                b't' => {
1081                    s.push('\t');
1082                    i += 2;
1083                }
1084                b'b' => {
1085                    s.push('\u{0008}');
1086                    i += 2;
1087                }
1088                b'f' => {
1089                    s.push('\u{000C}');
1090                    i += 2;
1091                }
1092                b'0' if i + 2 >= bytes.len() || !bytes[i + 2].is_ascii_digit() => {
1093                    s.push('\0');
1094                    i += 2;
1095                }
1096                b'x' => {
1097                    // \xH or \xHH — single byte by hex.
1098                    let h1 = bytes.get(i + 2).copied();
1099                    let h2 = bytes.get(i + 3).copied();
1100                    let n1 = h1.and_then(hex_digit_value);
1101                    let n2 = h2.and_then(hex_digit_value);
1102                    match (n1, n2) {
1103                        (Some(a), Some(b2)) => {
1104                            s.push((((a << 4) | b2) as u8) as char);
1105                            i += 4;
1106                        }
1107                        (Some(a), _) => {
1108                            s.push((a as u8) as char);
1109                            i += 3;
1110                        }
1111                        _ => {
1112                            // \x with no hex follows — literal x.
1113                            s.push('x');
1114                            i += 2;
1115                        }
1116                    }
1117                }
1118                d if d.is_ascii_digit() && d < b'8' => {
1119                    // \NNN octal — up to 3 octal digits.
1120                    let mut value: u32 = u32::from(d - b'0');
1121                    let mut take = 2;
1122                    while take < 4 {
1123                        let next = bytes.get(i + take).copied();
1124                        match next {
1125                            Some(c) if c.is_ascii_digit() && c < b'8' => {
1126                                value = (value << 3) | u32::from(c - b'0');
1127                                take += 1;
1128                            }
1129                            _ => break,
1130                        }
1131                    }
1132                    if let Some(c) = char::from_u32(value) {
1133                        s.push(c);
1134                    } else {
1135                        // Invalid Unicode — preserve as raw byte char.
1136                        s.push((value & 0xFF) as u8 as char);
1137                    }
1138                    i += take;
1139                }
1140                other => {
1141                    // Lenient fallback — same as PG with
1142                    // `standard_conforming_strings = off` warning:
1143                    // decode `\X` to literal `X`.
1144                    s.push(other as char);
1145                    i += 2;
1146                }
1147            }
1148        } else {
1149            let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
1150            s.push(ch);
1151            i += ch.len_utf8();
1152        }
1153    }
1154    Ok((Token::String(s), i - start))
1155}
1156
/// Numeric value (0–15) of an ASCII hexadecimal digit byte.
///
/// Accepts `0-9`, `a-f` and `A-F`; every other byte — including all
/// non-ASCII bytes — yields `None`.
fn hex_digit_value(b: u8) -> Option<u32> {
    // A `u8` converts to the code point of the same value, so only the
    // ASCII hex digits can satisfy `to_digit(16)`.
    char::from(b).to_digit(16)
}
1165
1166fn lex_number(s: &str) -> Result<(Token, usize), LexErrorKind> {
1167    let bytes = s.as_bytes();
1168    let mut i = 0usize;
1169    let mut is_float = false;
1170
1171    while i < bytes.len() && bytes[i].is_ascii_digit() {
1172        i += 1;
1173    }
1174    if i < bytes.len() && bytes[i] == b'.' {
1175        is_float = true;
1176        i += 1;
1177        while i < bytes.len() && bytes[i].is_ascii_digit() {
1178            i += 1;
1179        }
1180    }
1181    if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
1182        is_float = true;
1183        i += 1;
1184        if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
1185            i += 1;
1186        }
1187        let exp_start = i;
1188        while i < bytes.len() && bytes[i].is_ascii_digit() {
1189            i += 1;
1190        }
1191        if exp_start == i {
1192            return Err(LexErrorKind::BadNumber(s[..i].to_string()));
1193        }
1194    }
1195
1196    let lit = &s[..i];
1197    if is_float {
1198        lit.parse::<f64>()
1199            .map(|v| (Token::Float(v), i))
1200            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1201    } else {
1202        lit.parse::<i64>()
1203            .map(|v| (Token::Integer(v), i))
1204            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1205    }
1206}
1207
// Unit tests for the lexer. Every test drives the public `tokenize`
// entry point and checks either the exact token stream or the reported
// error kind.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Tokenize `s`, panicking with "lex ok" if the lexer reports an error.
    fn lex(s: &str) -> Vec<Token> {
        tokenize(s).expect("lex ok")
    }

    #[test]
    fn empty_yields_only_eof() {
        assert_eq!(lex(""), vec![Token::Eof]);
    }

    #[test]
    fn whitespace_only_yields_only_eof() {
        assert_eq!(lex("   \t\n  "), vec![Token::Eof]);
    }

    #[test]
    fn keywords_are_case_insensitive() {
        assert_eq!(
            lex("SELECT select Select"),
            vec![Token::Select, Token::Select, Token::Select, Token::Eof]
        );
    }

    #[test]
    fn identifiers_lowercase_ascii() {
        // Unquoted identifiers are folded to lower-case.
        assert_eq!(
            lex("hello WORLD _x x1"),
            vec![
                Token::Ident("hello".into()),
                Token::Ident("world".into()),
                Token::Ident("_x".into()),
                Token::Ident("x1".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn quoted_identifier_keeps_case_and_handles_embedded_quote() {
        // `""` inside a quoted identifier is one literal `"`.
        assert_eq!(
            lex(r#""User Name" "a""b""#),
            vec![
                Token::QuotedIdent("User Name".into()),
                Token::QuotedIdent("a\"b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn integer_and_float_literals() {
        // A `.` or an exponent makes the literal a Float; a leading `.`
        // without integer digits is accepted.
        assert_eq!(
            lex("0 42 1.5 .5 1e10 2.5e-3"),
            vec![
                Token::Integer(0),
                Token::Integer(42),
                Token::Float(1.5),
                Token::Float(0.5),
                Token::Float(1e10),
                Token::Float(2.5e-3),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn negative_number_is_minus_then_integer() {
        // PG follows this: unary minus is a separate token, parser folds it.
        assert_eq!(
            lex("-42"),
            vec![Token::Minus, Token::Integer(42), Token::Eof]
        );
    }

    #[test]
    fn string_literal_doubled_quote_escape() {
        // `''` inside a string literal is one literal `'`.
        assert_eq!(
            lex("'hello' 'it''s'"),
            vec![
                Token::String("hello".into()),
                Token::String("it's".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn all_comparison_and_arithmetic_operators() {
        // Both `<>` and `!=` map to NotEq.
        assert_eq!(
            lex("= <> != < <= > >= + - * /"),
            vec![
                Token::Eq,
                Token::NotEq,
                Token::NotEq,
                Token::Lt,
                Token::LtEq,
                Token::Gt,
                Token::GtEq,
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn punctuation() {
        assert_eq!(
            lex("( ) , ; ."),
            vec![
                Token::LParen,
                Token::RParen,
                Token::Comma,
                Token::Semicolon,
                Token::Dot,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn line_comment_skipped() {
        // A `--` comment runs to the end of the line.
        assert_eq!(
            lex("SELECT -- trailing junk\nFROM"),
            vec![Token::Select, Token::From, Token::Eof]
        );
    }

    #[test]
    fn block_comment_skipped() {
        assert_eq!(
            lex("SELECT /* skipped */ 1"),
            vec![Token::Select, Token::Integer(1), Token::Eof]
        );
    }

    #[test]
    fn unterminated_string_errors() {
        // The error position is the offset of the opening quote.
        let err = tokenize("'oops").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
        assert_eq!(err.pos, 0);
    }

    #[test]
    fn unterminated_block_comment_errors() {
        let err = tokenize("/* never closed").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedBlockComment));
    }

    #[test]
    fn unknown_char_errors() {
        // v7.17.0 Phase 2.6 — `@` standalone now lexes as
        // Token::At (mysqldump `'user'@'host'` DEFINER stitching),
        // so it can no longer serve as the unknown-character probe.
        // The regression input is the BEL control byte (0x07),
        // which must be rejected as UnknownChar.
        let err = tokenize("\x07").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnknownChar(_)));
    }

    #[test]
    fn at_alone_lexes_as_punctuation() {
        // v7.17.0 Phase 2.6 — the `'user'@'host'` MySQL DEFINER
        // form needs `@` to lex as a standalone token.
        assert_eq!(
            lex("'u'@'h'"),
            vec![
                Token::String("u".into()),
                Token::At,
                Token::String("h".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn dot_in_qualified_column() {
        assert_eq!(
            lex("t.col"),
            vec![
                Token::Ident("t".into()),
                Token::Dot,
                Token::Ident("col".into()),
                Token::Eof,
            ]
        );
    }

    // --- v0.11 brackets + distance op + vector keyword --------------------

    #[test]
    fn brackets_are_distinct_tokens() {
        assert_eq!(
            lex("[ ]"),
            vec![Token::LBracket, Token::RBracket, Token::Eof]
        );
    }

    #[test]
    fn l2_distance_is_three_char_token() {
        assert_eq!(
            lex("a <-> b"),
            vec![
                Token::Ident("a".into()),
                Token::L2Distance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        // Bare `<-` should NOT match L2Distance.
        assert_eq!(
            lex("a <- b"),
            vec![
                Token::Ident("a".into()),
                Token::Lt,
                Token::Minus,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn order_by_limit_are_keywords() {
        assert_eq!(
            lex("ORDER BY LIMIT"),
            vec![Token::Order, Token::By, Token::Limit, Token::Eof]
        );
    }

    // --- v1.2: pgvector distance ops + PG cast --------------------------

    #[test]
    fn inner_product_operator_3char() {
        assert_eq!(
            lex("a <#> b"),
            vec![
                Token::Ident("a".into()),
                Token::InnerProduct,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn cosine_distance_operator_3char() {
        assert_eq!(
            lex("a <=> b"),
            vec![
                Token::Ident("a".into()),
                Token::CosineDistance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        // Make sure the shorter `<=` still lexes as LtEq now that `<=>`
        // shares its prefix (greedy match takes the longest).
        assert_eq!(
            lex("a <= b"),
            vec![
                Token::Ident("a".into()),
                Token::LtEq,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn double_colon_cast_token() {
        // The type name after `::` is an ordinary (lower-cased) identifier.
        assert_eq!(
            lex("x::INT"),
            vec![
                Token::Ident("x".into()),
                Token::DoubleColon,
                Token::Ident("int".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn lone_single_colon_lexes_as_colon_token() {
        // v7.12.4 — single `:` is now a token (PL/pgSQL surface
        // + tsvector external-form literal both need it). The
        // pre-v7.12.4 "single colon = unknown char" behaviour
        // was incidental.
        let toks = tokenize(":x").expect("colon now lexes");
        assert_eq!(toks[0], Token::Colon);
    }

    #[test]
    fn colon_eq_lexes_as_assignment() {
        // v7.12.4 — PL/pgSQL assignment operator.
        let toks = tokenize("x := 1").expect("colon-eq lexes");
        // Only the operator at index 1 is asserted; the surrounding
        // tokens are presumably Ident("x") and Integer(1).
        assert!(matches!(toks[1], Token::ColonEq));
    }

    #[test]
    fn pg_escape_string_double_backslash_decodes_to_single() {
        // v7.18 — E'\\xdeadbeef' decodes to literal `\xdeadbeef`
        // (10 chars: backslash + xdeadbeef). The downstream
        // `::bytea` cast then reads that as the PG hex-form bytea
        // literal. mailrs D-pre #3.
        let toks = tokenize(r"E'\\xdeadbeef'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String(r"\xdeadbeef".into()), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_supports_basic_escapes() {
        // \n / \t / \' / \\ — the PG standard set.
        let toks = tokenize(r"E'a\nb\tc\'d\\e'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String("a\nb\tc'd\\e".into()), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_hex_byte() {
        // \xHH single byte. \x41 = 'A', \x42 = 'B'.
        let toks = tokenize(r"E'\x41B\x42'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String("ABB".into()), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_lowercase_e_prefix() {
        // The prefix is accepted in lower-case as well.
        let toks = tokenize(r"e'hi\n'").expect("e-string lexes");
        assert_eq!(toks, vec![Token::String("hi\n".into()), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_doubled_quote() {
        // Even in E-string the doubled '' is a literal '.
        let toks = tokenize(r"E'it''s ok'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String("it's ok".into()), Token::Eof]);
    }
}
spg_sql/lexer.rs

spg_sql/
lexer.rs