spg_sql/
lexer.rs

//! Lexer for the PG-dialect subset that SPG accepts.
//!
//! v0.2 token stream is value-only — no source spans yet. Errors do report
//! the byte offset where the offending construct started. Identifiers are
//! ASCII case-folded to lower-case (matches PG when un-quoted). Quoted
//! identifiers (`"..."`) preserve case; `""` is an embedded quote.
//! String literals (`'...'`) follow PG single-quote convention with `''`
//! as the embedded quote. Escape strings (`E'...'`) and dollar-quoted
//! strings (`$$...$$`, `$tag$...$tag$`) are lexed as well; both are emitted
//! as `Token::String`.
10
11use alloc::string::{String, ToString};
12use alloc::vec::Vec;
13use core::fmt;
14
/// One lexical token produced by `tokenize`.
///
/// Keyword variants carry no payload; identifier and literal variants
/// carry their (already decoded) text or value. The stream produced by
/// `tokenize` always ends with [`Token::Eof`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    As,
    Null,
    True,
    False,
    And,
    Or,
    Not,
    Create,
    Table,
    Insert,
    Into,
    Values,
    Index,
    On,
    Begin,
    Commit,
    Rollback,
    Order,
    By,
    Limit,

    // Identifiers
    /// Unquoted identifier, ASCII case-folded to lower-case.
    Ident(String),
    /// Quoted identifier: original case preserved, `""` → `"`.
    QuotedIdent(String),
    /// v7.14.0 — MySQL session / user variable reference
    /// (`@VAR` / `@@VAR`). The wrapped string is the verbatim
    /// source form (including the `@` / `@@` prefix). Used by
    /// mysqldump preamble (`SET @OLD_FOREIGN_KEY_CHECKS =
    /// @@FOREIGN_KEY_CHECKS, …`); SPG accepts the token and
    /// the SET parser treats the assignment as a no-op apart
    /// from any second LHS that targets a real session
    /// parameter (e.g. `FOREIGN_KEY_CHECKS=0`).
    SessionVar(String),

    // Literals
    Integer(i64),
    Float(f64),
    String(String),

    // Operators
    Plus,
    Minus,
    Star,
    Slash,
    Eq,
    /// `<>` or `!=`.
    NotEq,
    Lt,
    LtEq,
    Gt,
    GtEq,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR strict contained-in
    /// `<<`. LHS is strictly inside RHS (no equality).
    InetContainedBy,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR contained-in-or-equal
    /// `<<=`. LHS network ⊆ RHS network.
    InetContainedByEq,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR strict contains `>>`.
    /// LHS strictly contains RHS.
    InetContains,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR contains-or-equal `>>=`.
    /// LHS network ⊇ RHS network.
    InetContainsEq,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR network overlap `&&`.
    /// Either side contains any address of the other.
    InetOverlap,

    // Punctuation
    LParen,
    RParen,
    LBracket,
    RBracket,
    Comma,
    Semicolon,
    Dot,
    /// v7.17.0 Phase 2.6 — standalone `@` punctuation. Emitted when
    /// `@` is NOT followed by an ident-start byte (i.e. the
    /// `@VAR` / `@@VAR` SessionVar path doesn't match). Lets the
    /// parser stitch the MySQL `'user'@'host'` DEFINER form back
    /// together as String + At + String. Pre-2.6 this same shape
    /// surfaced as a `LexErrorKind::UnknownChar('@')` and broke
    /// every mysqldump CREATE VIEW with a DEFINER clause at lex
    /// time.
    At,
    /// v4.14 `->` — JSON object/array element access, returns json.
    JsonGet,
    /// v4.14 `->>` — same access, returns text.
    JsonGetText,
    /// v6.4.5 `#>` — JSON path walk, returns json. Path is the
    /// right-hand TEXT with PG `{a,b,0}` syntax.
    JsonGetPath,
    /// v6.4.5 `#>>` — same walk, returns text.
    JsonGetPathText,
    /// v6.4.5 `@>` — JSON containment. `j @> sub` returns true if
    /// every key/value in `sub` is present in `j` with structural
    /// containment for objects + arrays.
    JsonContains,
    /// v7.12.2 `@@` — tsvector / tsquery match. Either ordering
    /// (`vec @@ q` or `q @@ vec`) parses; engine eval normalises
    /// before matching.
    TsMatch,
    /// pgvector L2 distance operator `<->`. Lexed as one token so the
    /// parser can give it its own precedence rung.
    L2Distance,
    /// pgvector inner-product operator `<#>` (returns negative dot product
    /// so smaller still means more similar — same semantics as pgvector).
    InnerProduct,
    /// pgvector cosine distance operator `<=>`.
    CosineDistance,
    /// PG-style cast `expr::type` — single token because we want it to bind
    /// at postfix precedence.
    DoubleColon,
    /// v7.12.4 — PL/pgSQL assignment operator `:=`.
    /// Outside PL/pgSQL bodies this token has no SQL-side meaning.
    ColonEq,
    /// v7.12.4 — bare `:` separator. Used inside `tsvector` external-form
    /// literals (`'cat:1 dog:2'::tsvector`) and as the fallback path for
    /// the PL/pgSQL assignment lexer.
    Colon,
    /// Standard SQL string concatenation `||`.
    Concat,
    /// `IS` keyword — postfix `IS NULL` / `IS NOT NULL` predicates.
    Is,
    Between,
    In,
    Like,
    Group,
    Distinct,
    Union,
    All,
    Join,
    Inner,
    Left,
    Cross,
    Outer,
    Default,
    Savepoint,
    Release,
    To,
    Having,
    Show,
    Extract,
    Offset,
    Asc,
    Desc,
    /// `INTERVAL` — followed by a string literal carrying the span text
    /// (e.g. `INTERVAL '1 day 2 hours'`).
    Interval,
    /// v6.1.1 — `$N` parameter placeholder for the extended query
    /// protocol. The number N is 1-based per PostgreSQL convention.
    /// `$0` is not valid, and neither is an N above `u16::MAX`; the
    /// lexer rejects both.
    Placeholder(u16),

    /// v6.1.2 — `DROP` keyword. Used by `DROP PUBLICATION <name>`.
    /// Reserved for future `DROP TABLE` / `DROP INDEX` / `DROP USER`
    /// surface that currently goes through SHOW-shaped admin SQL.
    Drop,
    /// v6.1.2 — `FOR` keyword (publication scope).
    For,
    /// v6.1.2 — `TABLES` plural keyword (`FOR ALL TABLES`,
    /// `FOR ALL TABLES EXCEPT …`). The existing `TABLE` keyword
    /// stays a separate token so `CREATE TABLE`'s single-table
    /// form keeps lexing as today.
    Tables,
    /// v6.1.3 (reserved at v6.1.2 to keep the AST shape stable) —
    /// `EXCEPT` keyword for `FOR ALL TABLES EXCEPT t1, t2`.
    Except,
    /// v6.1.2 — `PUBLICATION` keyword.
    Publication,
    /// v6.1.4 (reserved at v6.1.2) — `SUBSCRIPTION` keyword.
    Subscription,
    /// v6.1.4 — `CONNECTION` keyword (for
    /// `CREATE SUBSCRIPTION … CONNECTION '<conn_str>' …`).
    Connection,

    /// End-of-input marker; always the last token `tokenize` emits.
    Eof,
}
197
/// The reason a lex attempt failed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A character that starts no recognised token.
    UnknownChar(char),
    /// A string literal (including the dollar-quoted form) with no closer.
    UnterminatedString,
    /// A quoted identifier with no closing quote.
    UnterminatedQuotedIdent,
    /// A `/* … */` comment with no closing `*/`.
    UnterminatedBlockComment,
    /// A malformed numeric literal or `$N` placeholder; carries the
    /// offending source text.
    BadNumber(String),
}

/// A lexing failure: what went wrong plus where it started.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// What went wrong.
    pub kind: LexErrorKind,
    /// Byte offset into the input where the offending construct started.
    pub pos: usize,
}

impl fmt::Display for LexError {
    /// Renders a one-line, human-readable message that always ends with
    /// `at byte <pos>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let pos = self.pos;
        match &self.kind {
            LexErrorKind::UnknownChar(c) => write!(f, "unknown char {c:?} at byte {pos}"),
            LexErrorKind::UnterminatedString => {
                write!(f, "unterminated string literal at byte {pos}")
            }
            LexErrorKind::UnterminatedQuotedIdent => {
                write!(f, "unterminated quoted identifier at byte {pos}")
            }
            LexErrorKind::UnterminatedBlockComment => {
                write!(f, "unterminated /* */ comment at byte {pos}")
            }
            LexErrorKind::BadNumber(s) => {
                write!(f, "invalid number literal {s:?} at byte {pos}")
            }
        }
    }
}
232
233/// Tokenize `input` into a `Vec<Token>` ending in `Token::Eof`.
234#[allow(clippy::too_many_lines)] // big match — splitting would obscure the dispatch table
235pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
236    let bytes = input.as_bytes();
237    let mut i = 0usize;
238    let mut out = Vec::new();
239
240    while i < bytes.len() {
241        let b = bytes[i];
242        match b {
243            b' ' | b'\t' | b'\n' | b'\r' => {
244                i += 1;
245            }
246            b'-' if peek_eq(bytes, i + 1, b'-') => {
247                i += 2;
248                while i < bytes.len() && bytes[i] != b'\n' {
249                    i += 1;
250                }
251            }
252            b'/' if peek_eq(bytes, i + 1, b'*') => {
253                let start = i;
254                // v7.14.0 — MySQL versioned conditional comment
255                // `/*!NNNNN <body> */`. The body is real SQL that
256                // MySQL/MariaDB executes when the runtime version
257                // matches the 5-digit code; PG strips the whole
258                // thing as a block comment. SPG sides with MySQL
259                // semantics for dump compatibility: skip the
260                // `/*!NNNNN ` prefix and continue lexing the body
261                // as ordinary tokens. The closing `*/` is later
262                // matched + skipped by the symmetric arm below.
263                if peek_eq(bytes, i + 2, b'!') {
264                    let mut j = i + 3;
265                    // skip the optional 5-digit version code +
266                    // following single whitespace
267                    while j < bytes.len() && bytes[j].is_ascii_digit() {
268                        j += 1;
269                    }
270                    if j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') {
271                        j += 1;
272                    }
273                    i = j;
274                    continue;
275                }
276                i += 2;
277                let mut closed = false;
278                while i + 1 < bytes.len() {
279                    if bytes[i] == b'*' && bytes[i + 1] == b'/' {
280                        i += 2;
281                        closed = true;
282                        break;
283                    }
284                    i += 1;
285                }
286                if !closed {
287                    return Err(LexError {
288                        kind: LexErrorKind::UnterminatedBlockComment,
289                        pos: start,
290                    });
291                }
292            }
293            // v7.14.0 — bare `*/` (closing of the v7.14 MySQL
294            // versioned-comment opener that didn't consume the
295            // closer). We treat it as an inline comment terminator
296            // and skip 2 bytes.
297            b'*' if peek_eq(bytes, i + 1, b'/') => {
298                i += 2;
299            }
300            b'\'' => {
301                let (tok, consumed) = lex_quoted(input, i, b'\'', false)?;
302                out.push(tok);
303                i += consumed;
304            }
305            // v7.18 — PG escape-string literal `E'...'` / `e'...'`.
306            // Closes the mailrs D-pre #3 reverse-acceptance gap:
307            // `INSERT INTO oq VALUES (E'\\xdeadbeef'::bytea)` needs
308            // the `E` prefix so `\\` decodes to a single `\`. The
309            // produced Token::String carries the decoded body so
310            // downstream parser / cast paths treat it identically
311            // to a regular string literal.
312            b'E' | b'e' if peek_eq(bytes, i + 1, b'\'') => {
313                let (tok, consumed) = lex_escape_string(input, i + 1)?;
314                out.push(tok);
315                i += 1 + consumed;
316            }
317            b'"' => {
318                let (tok, consumed) = lex_quoted(input, i, b'"', true)?;
319                out.push(tok);
320                i += consumed;
321            }
322            // MySQL-flavoured backtick-quoted identifier. Same semantics
323            // as the standard `"..."` form, including embedded "``" as
324            // a literal backtick.
325            b'`' => {
326                let (tok, consumed) = lex_quoted(input, i, b'`', true)?;
327                out.push(tok);
328                i += consumed;
329            }
330            b if b.is_ascii_alphabetic() || b == b'_' => {
331                let start = i;
332                i += 1;
333                while i < bytes.len() {
334                    let c = bytes[i];
335                    if c.is_ascii_alphanumeric() || c == b'_' {
336                        i += 1;
337                    } else {
338                        break;
339                    }
340                }
341                let raw = &input[start..i];
342                // v3.0.5: try the keyword table case-insensitively
343                // without allocating; only the ident fall-through
344                // pays for a lowercase String.
345                out.push(keyword_or_ident_raw(raw));
346            }
347            b if b.is_ascii_digit() => {
348                let (tok, consumed) =
349                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
350                out.push(tok);
351                i += consumed;
352            }
353            b'.' if peek_pred(bytes, i + 1, u8::is_ascii_digit) => {
354                let (tok, consumed) =
355                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
356                out.push(tok);
357                i += consumed;
358            }
359            b'+' => single(&mut out, Token::Plus, &mut i),
360            b'-' => {
361                // v4.14: `->>` and `->` for JSON path access. `->>`
362                // must be tried before `->` (longest match).
363                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
364                    out.push(Token::JsonGetText);
365                    i += 3;
366                } else if peek_eq(bytes, i + 1, b'>') {
367                    out.push(Token::JsonGet);
368                    i += 2;
369                } else {
370                    single(&mut out, Token::Minus, &mut i);
371                }
372            }
373            // v6.4.5: `#>>` and `#>` JSON path walk.
374            b'#' => {
375                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
376                    out.push(Token::JsonGetPathText);
377                    i += 3;
378                } else if peek_eq(bytes, i + 1, b'>') {
379                    out.push(Token::JsonGetPath);
380                    i += 2;
381                } else {
382                    return Err(LexError {
383                        kind: LexErrorKind::UnknownChar('#'),
384                        pos: i,
385                    });
386                }
387            }
388            // v6.4.5: `@>` JSON containment.
389            // v7.12.2: `@@` tsvector / tsquery match.
390            // v7.14.0: `@@NAME` MySQL session variable ref +
391            //          `@NAME` user variable ref. mysqldump preamble
392            //          uses both heavily (`SET @OLD_FOREIGN_KEY_CHECKS
393            //          = @@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0`).
394            //          We lex both as a single SessionVar token so
395            //          the parser can accept and ignore them.
396            b'@' => {
397                if peek_eq(bytes, i + 1, b'>') {
398                    out.push(Token::JsonContains);
399                    i += 2;
400                } else if peek_eq(bytes, i + 1, b'@')
401                    && !is_session_var_ident_start(bytes.get(i + 2).copied())
402                {
403                    // `@@` not followed by an ident-start byte is
404                    // the tsquery `@@` operator.
405                    out.push(Token::TsMatch);
406                    i += 2;
407                } else {
408                    // `@VAR` / `@@VAR` — MySQL user / session
409                    // variable reference. Consume the ident-shaped
410                    // tail and emit as Token::SessionVar so the
411                    // SET parser can accept-and-ignore.
412                    let prefix_end = if peek_eq(bytes, i + 1, b'@') {
413                        i + 2
414                    } else {
415                        i + 1
416                    };
417                    let mut end = prefix_end;
418                    while end < bytes.len() && is_session_var_ident_continue(bytes[end]) {
419                        end += 1;
420                    }
421                    if end == prefix_end {
422                        // v7.17.0 Phase 2.6 — `@` not followed by an
423                        // ident-shaped tail. mysqldump's DEFINER
424                        // form `'user'@'host'` lands here (next
425                        // byte is `'`). Emit as Token::At so the
426                        // parser can stitch the surrounding String
427                        // tokens. Single `@@` already short-circuits
428                        // to Token::TsMatch above, so this only
429                        // fires for a true lone `@`.
430                        out.push(Token::At);
431                        i = prefix_end;
432                        continue;
433                    }
434                    out.push(Token::SessionVar(input[i..end].to_string()));
435                    i = end;
436                }
437            }
438            b'*' => single(&mut out, Token::Star, &mut i),
439            b'/' => single(&mut out, Token::Slash, &mut i),
440            b'(' => single(&mut out, Token::LParen, &mut i),
441            b')' => single(&mut out, Token::RParen, &mut i),
442            b'[' => single(&mut out, Token::LBracket, &mut i),
443            b']' => single(&mut out, Token::RBracket, &mut i),
444            b',' => single(&mut out, Token::Comma, &mut i),
445            b';' => single(&mut out, Token::Semicolon, &mut i),
446            b'.' => single(&mut out, Token::Dot, &mut i),
447            b'=' => single(&mut out, Token::Eq, &mut i),
448            b'<' => {
449                if peek_eq(bytes, i + 1, b'=') && peek_eq(bytes, i + 2, b'>') {
450                    out.push(Token::CosineDistance);
451                    i += 3;
452                } else if peek_eq(bytes, i + 1, b'#') && peek_eq(bytes, i + 2, b'>') {
453                    out.push(Token::InnerProduct);
454                    i += 3;
455                } else if peek_eq(bytes, i + 1, b'-') && peek_eq(bytes, i + 2, b'>') {
456                    out.push(Token::L2Distance);
457                    i += 3;
458                } else if peek_eq(bytes, i + 1, b'<') && peek_eq(bytes, i + 2, b'=') {
459                    // v7.17.0 Phase 3.P0-47 — PG INET `<<=` contained-or-equal.
460                    out.push(Token::InetContainedByEq);
461                    i += 3;
462                } else if peek_eq(bytes, i + 1, b'<') {
463                    // v7.17.0 Phase 3.P0-47 — PG INET `<<` strict contained.
464                    out.push(Token::InetContainedBy);
465                    i += 2;
466                } else if peek_eq(bytes, i + 1, b'=') {
467                    out.push(Token::LtEq);
468                    i += 2;
469                } else if peek_eq(bytes, i + 1, b'>') {
470                    out.push(Token::NotEq);
471                    i += 2;
472                } else {
473                    out.push(Token::Lt);
474                    i += 1;
475                }
476            }
477            b':' if peek_eq(bytes, i + 1, b':') => {
478                out.push(Token::DoubleColon);
479                i += 2;
480            }
481            b':' if peek_eq(bytes, i + 1, b'=') => {
482                // v7.12.4 — PL/pgSQL assignment operator `:=`.
483                out.push(Token::ColonEq);
484                i += 2;
485            }
486            b':' => {
487                // v7.12.4 — bare `:`. Used inside `tsvector` external-form
488                // literals which the cast parser consumes in-token, and as a
489                // separator the PL/pgSQL assignment lexer can recover from.
490                out.push(Token::Colon);
491                i += 1;
492            }
493            b'|' if peek_eq(bytes, i + 1, b'|') => {
494                out.push(Token::Concat);
495                i += 2;
496            }
497            b'>' => {
498                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'=') {
499                    // v7.17.0 Phase 3.P0-47 — PG INET `>>=` contains-or-equal.
500                    out.push(Token::InetContainsEq);
501                    i += 3;
502                } else if peek_eq(bytes, i + 1, b'>') {
503                    // v7.17.0 Phase 3.P0-47 — PG INET `>>` strict contains.
504                    out.push(Token::InetContains);
505                    i += 2;
506                } else if peek_eq(bytes, i + 1, b'=') {
507                    out.push(Token::GtEq);
508                    i += 2;
509                } else {
510                    out.push(Token::Gt);
511                    i += 1;
512                }
513            }
514            b'&' if peek_eq(bytes, i + 1, b'&') => {
515                // v7.17.0 Phase 3.P0-47 — PG INET network overlap `&&`.
516                out.push(Token::InetOverlap);
517                i += 2;
518            }
519            b'!' if peek_eq(bytes, i + 1, b'=') => {
520                out.push(Token::NotEq);
521                i += 2;
522            }
523            // v7.9.27 — PG dollar-quoted string `$$ … $$` (or
524            // `$tag$ … $tag$`). Used in `DO $$ … $$ LANGUAGE
525            // plpgsql;` blocks that pg_dump emits for idempotent
526            // migrations. SPG has no PL/pgSQL, so the lexer
527            // consumes the entire string as a single Token::String
528            // and the parser treats the surrounding `DO …;` as a
529            // no-op. mailrs follow-up H1.
530            b'$' if i + 1 < bytes.len() && bytes[i + 1] == b'$' => {
531                // Empty tag form: `$$ … $$`.
532                let end = find_dollar_tag_end(bytes, i + 2, b"$$");
533                let body = match end {
534                    Some(e) => &input[i + 2..e],
535                    None => {
536                        return Err(LexError {
537                            kind: LexErrorKind::UnterminatedString,
538                            pos: i,
539                        });
540                    }
541                };
542                out.push(Token::String(body.to_string()));
543                i = end.unwrap() + 2;
544            }
545            b'$' if i + 1 < bytes.len()
546                && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_') =>
547            {
548                // Tagged form: `$foo$ … $foo$`. Scan the tag
549                // ident, find the closing copy.
550                let mut j = i + 1;
551                while j < bytes.len() && (bytes[j].is_ascii_alphanumeric() || bytes[j] == b'_') {
552                    j += 1;
553                }
554                if j >= bytes.len() || bytes[j] != b'$' {
555                    // Not a dollar-quoted string — fall through
556                    // to the generic-unknown-char path.
557                    let ch = input[i..].chars().next().unwrap_or('?');
558                    return Err(LexError {
559                        kind: LexErrorKind::UnknownChar(ch),
560                        pos: i,
561                    });
562                }
563                let close: alloc::vec::Vec<u8> = bytes[i..=j].to_vec();
564                let end = find_dollar_tag_end(bytes, j + 1, &close);
565                let body = match end {
566                    Some(e) => &input[j + 1..e],
567                    None => {
568                        return Err(LexError {
569                            kind: LexErrorKind::UnterminatedString,
570                            pos: i,
571                        });
572                    }
573                };
574                out.push(Token::String(body.to_string()));
575                i = end.unwrap() + close.len();
576            }
577            // v6.1.1: `$N` parameter placeholder for the extended
578            // query protocol. PG numbers them 1..=N; we reject $0
579            // and a bare `$` not followed by a digit.
580            b'$' if i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() => {
581                let mut j = i + 1;
582                let mut n: u32 = 0;
583                while j < bytes.len() && bytes[j].is_ascii_digit() {
584                    n = n
585                        .saturating_mul(10)
586                        .saturating_add(u32::from(bytes[j] - b'0'));
587                    j += 1;
588                }
589                if n == 0 || n > u32::from(u16::MAX) {
590                    return Err(LexError {
591                        kind: LexErrorKind::BadNumber(input[i..j].to_string()),
592                        pos: i,
593                    });
594                }
595                #[allow(clippy::cast_possible_truncation)]
596                out.push(Token::Placeholder(n as u16));
597                i = j;
598            }
599            _ => {
600                let ch = input[i..].chars().next().unwrap_or('?');
601                return Err(LexError {
602                    kind: LexErrorKind::UnknownChar(ch),
603                    pos: i,
604                });
605            }
606        }
607    }
608    out.push(Token::Eof);
609    Ok(out)
610}
611
/// Returns `true` when `bytes[i]` exists and equals `target`.
///
/// Out-of-range `i` yields `false`, so callers can look ahead past the
/// end of the input without a separate bounds check.
fn peek_eq(bytes: &[u8], i: usize, target: u8) -> bool {
    bytes.get(i) == Some(&target)
}
615
/// v7.14.0 — recognise the first byte of a MySQL session/user
/// variable name (after `@` or `@@`). PG-strict idents are ASCII
/// letter or underscore; MySQL also allows leading digits inside
/// quoted names but unquoted vars match the same shape.
///
/// Takes an `Option` so the caller can pass `bytes.get(..).copied()`
/// directly; `None` (end of input) is never an ident start.
fn is_session_var_ident_start(b: Option<u8>) -> bool {
    matches!(b, Some(c) if c.is_ascii_alphabetic() || c == b'_')
}
623
/// Continuation byte for a `@VAR`/`@@VAR` ident (after the first
/// alphabet/underscore byte). Letters, digits, underscore, dot
/// (MySQL allows session-scope qualifiers like
/// `@@global.sql_mode`) and `$` (some MySQL versions accept it).
fn is_session_var_ident_continue(b: u8) -> bool {
    b.is_ascii_alphanumeric() || matches!(b, b'_' | b'.' | b'$')
}
631
/// v7.9.27 — find the start index of the next occurrence of `tag`
/// (e.g. `b"$$"` or `b"$foo$"`) in `bytes` starting at `from`.
///
/// The returned index is relative to the start of `bytes`, not to
/// `from`. Returns `None` when `tag` is empty, when `from` lies past
/// the end of `bytes`, or when no occurrence exists.
fn find_dollar_tag_end(bytes: &[u8], from: usize, tag: &[u8]) -> Option<usize> {
    // `windows(0)` panics, so the empty tag has to be rejected up front.
    if tag.is_empty() {
        return None;
    }
    bytes
        .get(from..)? // `None` when `from > bytes.len()`
        .windows(tag.len())
        .position(|window| window == tag)
        .map(|offset| from + offset)
}
647
/// Returns `true` when `bytes[i]` exists and satisfies `pred`.
///
/// Out-of-range `i` yields `false` without calling `pred`.
fn peek_pred<F: Fn(&u8) -> bool>(bytes: &[u8], i: usize, pred: F) -> bool {
    bytes.get(i).is_some_and(pred)
}
651
652fn single(out: &mut Vec<Token>, tok: Token, i: &mut usize) {
653    out.push(tok);
654    *i += 1;
655}
656
657/// Length-first ASCII-CI keyword lookup. Avoids allocating a
658/// lowercase `String` when the input matches a keyword; only the ident
659/// fall-through path pays for the lowercase copy.
660///
661/// Grouped by length so the outer `match` becomes a small jump table.
662/// Within a length bucket every keyword has either a unique first
663/// byte (cheap dispatch) or a small set of disambiguating
664/// trailing-byte comparisons. All comparisons are ASCII-CI (XOR
665/// 0x20 on each byte before the compare).
666fn keyword_or_ident_raw(raw: &str) -> Token {
667    let b = raw.as_bytes();
668    let tok = match b.len() {
669        2 => kw_len2(b),
670        3 => kw_len3(b),
671        4 => kw_len4(b),
672        5 => kw_len5(b),
673        6 => kw_len6(b),
674        7 => kw_len7(b),
675        8 => kw_len8(b),
676        9 => kw_len9(b),
677        10 => kw_len10(b),
678        11 => kw_len11(b),
679        12 => kw_len12(b),
680        _ => None,
681    };
682    match tok {
683        Some(t) => t,
684        // Ident fall-through: this is the only path that allocates.
685        None => Token::Ident(raw.to_ascii_lowercase()),
686    }
687}
688
/// ASCII case-insensitive equality of `input` against a lowercase literal.
///
/// Only `input` is folded (via `to_ascii_lowercase`), so `lower` must
/// already be lowercase: an uppercase byte in `lower` can never match.
/// Slices of different length compare unequal.
#[inline]
fn eq_ci(input: &[u8], lower: &[u8]) -> bool {
    input.len() == lower.len()
        && input
            .iter()
            .zip(lower)
            .all(|(byte, expected)| byte.to_ascii_lowercase() == *expected)
}
706
707#[inline]
708fn kw_len2(b: &[u8]) -> Option<Token> {
709    // 7 keywords: as, by, in, is, on, or, to
710    if eq_ci(b, b"as") {
711        return Some(Token::As);
712    }
713    if eq_ci(b, b"by") {
714        return Some(Token::By);
715    }
716    if eq_ci(b, b"in") {
717        return Some(Token::In);
718    }
719    if eq_ci(b, b"is") {
720        return Some(Token::Is);
721    }
722    if eq_ci(b, b"on") {
723        return Some(Token::On);
724    }
725    if eq_ci(b, b"or") {
726        return Some(Token::Or);
727    }
728    if eq_ci(b, b"to") {
729        return Some(Token::To);
730    }
731    None
732}
733
734#[inline]
735fn kw_len3(b: &[u8]) -> Option<Token> {
736    // 5 keywords: all, and, asc, not, for
737    if eq_ci(b, b"for") {
738        return Some(Token::For);
739    }
740    if eq_ci(b, b"all") {
741        return Some(Token::All);
742    }
743    if eq_ci(b, b"and") {
744        return Some(Token::And);
745    }
746    if eq_ci(b, b"asc") {
747        return Some(Token::Asc);
748    }
749    if eq_ci(b, b"not") {
750        return Some(Token::Not);
751    }
752    None
753}
754
755#[inline]
756fn kw_len4(b: &[u8]) -> Option<Token> {
757    // 10 keywords: from, null, true, into, like, join, left, show, desc, drop
758    if eq_ci(b, b"from") {
759        return Some(Token::From);
760    }
761    if eq_ci(b, b"drop") {
762        return Some(Token::Drop);
763    }
764    if eq_ci(b, b"null") {
765        return Some(Token::Null);
766    }
767    if eq_ci(b, b"true") {
768        return Some(Token::True);
769    }
770    if eq_ci(b, b"into") {
771        return Some(Token::Into);
772    }
773    if eq_ci(b, b"like") {
774        return Some(Token::Like);
775    }
776    if eq_ci(b, b"join") {
777        return Some(Token::Join);
778    }
779    if eq_ci(b, b"left") {
780        return Some(Token::Left);
781    }
782    if eq_ci(b, b"show") {
783        return Some(Token::Show);
784    }
785    if eq_ci(b, b"desc") {
786        return Some(Token::Desc);
787    }
788    None
789}
790
791#[inline]
792fn kw_len5(b: &[u8]) -> Option<Token> {
793    // 12 keywords: false, where, table, index, begin, order, limit,
794    // group, union, inner, cross, outer
795    if eq_ci(b, b"false") {
796        return Some(Token::False);
797    }
798    if eq_ci(b, b"where") {
799        return Some(Token::Where);
800    }
801    if eq_ci(b, b"table") {
802        return Some(Token::Table);
803    }
804    if eq_ci(b, b"index") {
805        return Some(Token::Index);
806    }
807    if eq_ci(b, b"begin") {
808        return Some(Token::Begin);
809    }
810    if eq_ci(b, b"order") {
811        return Some(Token::Order);
812    }
813    if eq_ci(b, b"limit") {
814        return Some(Token::Limit);
815    }
816    if eq_ci(b, b"group") {
817        return Some(Token::Group);
818    }
819    if eq_ci(b, b"union") {
820        return Some(Token::Union);
821    }
822    if eq_ci(b, b"inner") {
823        return Some(Token::Inner);
824    }
825    if eq_ci(b, b"cross") {
826        return Some(Token::Cross);
827    }
828    if eq_ci(b, b"outer") {
829        return Some(Token::Outer);
830    }
831    None
832}
833
834#[inline]
835fn kw_len6(b: &[u8]) -> Option<Token> {
836    // 9 keywords: select, create, insert, values, commit, having, offset, tables, except
837    if eq_ci(b, b"select") {
838        return Some(Token::Select);
839    }
840    if eq_ci(b, b"tables") {
841        return Some(Token::Tables);
842    }
843    if eq_ci(b, b"except") {
844        return Some(Token::Except);
845    }
846    if eq_ci(b, b"create") {
847        return Some(Token::Create);
848    }
849    if eq_ci(b, b"insert") {
850        return Some(Token::Insert);
851    }
852    if eq_ci(b, b"values") {
853        return Some(Token::Values);
854    }
855    if eq_ci(b, b"commit") {
856        return Some(Token::Commit);
857    }
858    if eq_ci(b, b"having") {
859        return Some(Token::Having);
860    }
861    if eq_ci(b, b"offset") {
862        return Some(Token::Offset);
863    }
864    None
865}
866
867#[inline]
868fn kw_len7(b: &[u8]) -> Option<Token> {
869    // 4 keywords: between, default, release, extract
870    if eq_ci(b, b"between") {
871        return Some(Token::Between);
872    }
873    if eq_ci(b, b"default") {
874        return Some(Token::Default);
875    }
876    if eq_ci(b, b"release") {
877        return Some(Token::Release);
878    }
879    if eq_ci(b, b"extract") {
880        return Some(Token::Extract);
881    }
882    None
883}
884
885#[inline]
886fn kw_len8(b: &[u8]) -> Option<Token> {
887    // 3 keywords: rollback, distinct, interval
888    if eq_ci(b, b"rollback") {
889        return Some(Token::Rollback);
890    }
891    if eq_ci(b, b"distinct") {
892        return Some(Token::Distinct);
893    }
894    if eq_ci(b, b"interval") {
895        return Some(Token::Interval);
896    }
897    None
898}
899
900#[inline]
901fn kw_len9(b: &[u8]) -> Option<Token> {
902    // 1 keyword: savepoint
903    if eq_ci(b, b"savepoint") {
904        return Some(Token::Savepoint);
905    }
906    None
907}
908
909#[inline]
910fn kw_len10(b: &[u8]) -> Option<Token> {
911    // 1 keyword: connection
912    if eq_ci(b, b"connection") {
913        return Some(Token::Connection);
914    }
915    None
916}
917
918#[inline]
919fn kw_len11(b: &[u8]) -> Option<Token> {
920    // 1 keyword: publication
921    if eq_ci(b, b"publication") {
922        return Some(Token::Publication);
923    }
924    None
925}
926
927#[inline]
928fn kw_len12(b: &[u8]) -> Option<Token> {
929    // 1 keyword: subscription
930    if eq_ci(b, b"subscription") {
931        return Some(Token::Subscription);
932    }
933    None
934}
935
936/// Lex a `'...'` string literal or `"..."` quoted identifier. The opening
937/// quote sits at `input[start]`; `quote` is its byte value. `is_ident` selects
938/// the resulting token shape.
939///
940/// PG-style doubling escapes the quote: `''` inside `'...'` is a literal `'`,
941/// same for `""` inside `"..."`.
942fn lex_quoted(
943    input: &str,
944    start: usize,
945    quote: u8,
946    is_ident: bool,
947) -> Result<(Token, usize), LexError> {
948    let bytes = input.as_bytes();
949    let mut i = start + 1;
950    let mut s = String::new();
951    loop {
952        if i >= bytes.len() {
953            return Err(LexError {
954                kind: if is_ident {
955                    LexErrorKind::UnterminatedQuotedIdent
956                } else {
957                    LexErrorKind::UnterminatedString
958                },
959                pos: start,
960            });
961        }
962        if bytes[i] == quote {
963            if peek_eq(bytes, i + 1, quote) {
964                s.push(quote as char);
965                i += 2;
966            } else {
967                i += 1;
968                break;
969            }
970        } else {
971            let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
972            s.push(ch);
973            i += ch.len_utf8();
974        }
975    }
976    let tok = if is_ident {
977        Token::QuotedIdent(s)
978    } else {
979        Token::String(s)
980    };
981    Ok((tok, i - start))
982}
983
984/// v7.18 — Lex a PG escape-string literal `E'...'`. `start` points
985/// at the opening single quote (the `E` was matched by the caller
986/// and is NOT part of `start`'s offset semantics — the consumed
987/// count returned excludes the `E`, which the caller adds).
988///
989/// Recognised escape sequences:
990///   \\ \' \" — literal backslash / quote
991///   \n \r \t \b \f — standard whitespace controls
992///   \0 — NUL
993///   \xHH — single hex byte (1–2 hex digits)
994///   \NNN — octal byte (1–3 octal digits)
995/// Any other `\X` decodes to the literal byte `X` (PG warns; SPG
996/// follows the lenient behaviour pg_dump output relies on).
997///
998/// Doubled `''` is still a literal `'` (same as the non-E form).
999fn lex_escape_string(input: &str, start: usize) -> Result<(Token, usize), LexError> {
1000    let bytes = input.as_bytes();
1001    debug_assert_eq!(bytes[start], b'\'');
1002    let mut i = start + 1;
1003    let mut s = String::new();
1004    loop {
1005        if i >= bytes.len() {
1006            return Err(LexError {
1007                kind: LexErrorKind::UnterminatedString,
1008                pos: start,
1009            });
1010        }
1011        let b = bytes[i];
1012        if b == b'\'' {
1013            if peek_eq(bytes, i + 1, b'\'') {
1014                s.push('\'');
1015                i += 2;
1016                continue;
1017            }
1018            i += 1;
1019            break;
1020        }
1021        if b == b'\\' && i + 1 < bytes.len() {
1022            let n = bytes[i + 1];
1023            match n {
1024                b'\\' => { s.push('\\'); i += 2; }
1025                b'\'' => { s.push('\''); i += 2; }
1026                b'"' => { s.push('"'); i += 2; }
1027                b'n' => { s.push('\n'); i += 2; }
1028                b'r' => { s.push('\r'); i += 2; }
1029                b't' => { s.push('\t'); i += 2; }
1030                b'b' => { s.push('\u{0008}'); i += 2; }
1031                b'f' => { s.push('\u{000C}'); i += 2; }
1032                b'0' if i + 2 >= bytes.len() || !bytes[i + 2].is_ascii_digit() => {
1033                    s.push('\0');
1034                    i += 2;
1035                }
1036                b'x' => {
1037                    // \xH or \xHH — single byte by hex.
1038                    let h1 = bytes.get(i + 2).copied();
1039                    let h2 = bytes.get(i + 3).copied();
1040                    let n1 = h1.and_then(hex_digit_value);
1041                    let n2 = h2.and_then(hex_digit_value);
1042                    match (n1, n2) {
1043                        (Some(a), Some(b2)) => {
1044                            s.push((((a << 4) | b2) as u8) as char);
1045                            i += 4;
1046                        }
1047                        (Some(a), _) => {
1048                            s.push((a as u8) as char);
1049                            i += 3;
1050                        }
1051                        _ => {
1052                            // \x with no hex follows — literal x.
1053                            s.push('x');
1054                            i += 2;
1055                        }
1056                    }
1057                }
1058                d if d.is_ascii_digit() && d < b'8' => {
1059                    // \NNN octal — up to 3 octal digits.
1060                    let mut value: u32 = u32::from(d - b'0');
1061                    let mut take = 2;
1062                    while take < 4 {
1063                        let next = bytes.get(i + take).copied();
1064                        match next {
1065                            Some(c) if c.is_ascii_digit() && c < b'8' => {
1066                                value = (value << 3) | u32::from(c - b'0');
1067                                take += 1;
1068                            }
1069                            _ => break,
1070                        }
1071                    }
1072                    if let Some(c) = char::from_u32(value) {
1073                        s.push(c);
1074                    } else {
1075                        // Invalid Unicode — preserve as raw byte char.
1076                        s.push((value & 0xFF) as u8 as char);
1077                    }
1078                    i += take;
1079                }
1080                other => {
1081                    // Lenient fallback — same as PG with
1082                    // `standard_conforming_strings = off` warning:
1083                    // decode `\X` to literal `X`.
1084                    s.push(other as char);
1085                    i += 2;
1086                }
1087            }
1088        } else {
1089            let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
1090            s.push(ch);
1091            i += ch.len_utf8();
1092        }
1093    }
1094    Ok((Token::String(s), i - start))
1095}
1096
/// Numeric value (0–15) of an ASCII hexadecimal digit.
///
/// Accepts `0`–`9`, `a`–`f` and `A`–`F`; every other byte, including any
/// byte ≥ 0x80, yields `None`.
fn hex_digit_value(b: u8) -> Option<u32> {
    // A `u8` converts to the `char` with the same code point, so bytes
    // outside ASCII can never be mistaken for a digit.
    char::from(b).to_digit(16)
}
1105
1106fn lex_number(s: &str) -> Result<(Token, usize), LexErrorKind> {
1107    let bytes = s.as_bytes();
1108    let mut i = 0usize;
1109    let mut is_float = false;
1110
1111    while i < bytes.len() && bytes[i].is_ascii_digit() {
1112        i += 1;
1113    }
1114    if i < bytes.len() && bytes[i] == b'.' {
1115        is_float = true;
1116        i += 1;
1117        while i < bytes.len() && bytes[i].is_ascii_digit() {
1118            i += 1;
1119        }
1120    }
1121    if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
1122        is_float = true;
1123        i += 1;
1124        if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
1125            i += 1;
1126        }
1127        let exp_start = i;
1128        while i < bytes.len() && bytes[i].is_ascii_digit() {
1129            i += 1;
1130        }
1131        if exp_start == i {
1132            return Err(LexErrorKind::BadNumber(s[..i].to_string()));
1133        }
1134    }
1135
1136    let lit = &s[..i];
1137    if is_float {
1138        lit.parse::<f64>()
1139            .map(|v| (Token::Float(v), i))
1140            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1141    } else {
1142        lit.parse::<i64>()
1143            .map(|v| (Token::Integer(v), i))
1144            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1145    }
1146}
1147
// Unit tests for the lexer. Each test feeds a small SQL fragment to
// `tokenize` and checks the resulting token stream or error.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Tokenize `s`, panicking if the lexer reports an error.
    fn lex(s: &str) -> Vec<Token> {
        tokenize(s).expect("lex ok")
    }

    #[test]
    fn empty_yields_only_eof() {
        assert_eq!(lex(""), vec![Token::Eof]);
    }

    #[test]
    fn whitespace_only_yields_only_eof() {
        assert_eq!(lex("   \t\n  "), vec![Token::Eof]);
    }

    #[test]
    fn keywords_are_case_insensitive() {
        assert_eq!(
            lex("SELECT select Select"),
            vec![Token::Select, Token::Select, Token::Select, Token::Eof]
        );
    }

    #[test]
    fn identifiers_lowercase_ascii() {
        assert_eq!(
            lex("hello WORLD _x x1"),
            vec![
                Token::Ident("hello".into()),
                Token::Ident("world".into()),
                Token::Ident("_x".into()),
                Token::Ident("x1".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn quoted_identifier_keeps_case_and_handles_embedded_quote() {
        assert_eq!(
            lex(r#""User Name" "a""b""#),
            vec![
                Token::QuotedIdent("User Name".into()),
                Token::QuotedIdent("a\"b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn integer_and_float_literals() {
        assert_eq!(
            lex("0 42 1.5 .5 1e10 2.5e-3"),
            vec![
                Token::Integer(0),
                Token::Integer(42),
                Token::Float(1.5),
                Token::Float(0.5),
                Token::Float(1e10),
                Token::Float(2.5e-3),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn negative_number_is_minus_then_integer() {
        // PG follows this: unary minus is a separate token, parser folds it.
        assert_eq!(
            lex("-42"),
            vec![Token::Minus, Token::Integer(42), Token::Eof]
        );
    }

    #[test]
    fn string_literal_doubled_quote_escape() {
        assert_eq!(
            lex("'hello' 'it''s'"),
            vec![
                Token::String("hello".into()),
                Token::String("it's".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn all_comparison_and_arithmetic_operators() {
        assert_eq!(
            lex("= <> != < <= > >= + - * /"),
            vec![
                Token::Eq,
                Token::NotEq,
                Token::NotEq,
                Token::Lt,
                Token::LtEq,
                Token::Gt,
                Token::GtEq,
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn punctuation() {
        assert_eq!(
            lex("( ) , ; ."),
            vec![
                Token::LParen,
                Token::RParen,
                Token::Comma,
                Token::Semicolon,
                Token::Dot,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn line_comment_skipped() {
        assert_eq!(
            lex("SELECT -- trailing junk\nFROM"),
            vec![Token::Select, Token::From, Token::Eof]
        );
    }

    #[test]
    fn block_comment_skipped() {
        assert_eq!(
            lex("SELECT /* skipped */ 1"),
            vec![Token::Select, Token::Integer(1), Token::Eof]
        );
    }

    #[test]
    fn unterminated_string_errors() {
        let err = tokenize("'oops").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
        // The error position is the offset of the opening quote.
        assert_eq!(err.pos, 0);
    }

    #[test]
    fn unterminated_block_comment_errors() {
        let err = tokenize("/* never closed").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedBlockComment));
    }

    #[test]
    fn unknown_char_errors() {
        // v7.17.0 Phase 2.6 — `@` standalone now lexes as
        // Token::At (mysqldump `'user'@'host'` DEFINER stitching),
        // so it can no longer serve as the unknown-character probe.
        // The probe used here is the BEL control byte (`\x07`).
        // NOTE(review): an earlier wording of this comment referred
        // to `?`, but the input below is BEL — confirm whether a
        // bare `?` should be covered by a separate test.
        let err = tokenize("\x07").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnknownChar(_)));
    }

    #[test]
    fn at_alone_lexes_as_punctuation() {
        // v7.17.0 Phase 2.6 — the `'user'@'host'` MySQL DEFINER
        // form needs `@` to lex as a standalone token.
        assert_eq!(
            lex("'u'@'h'"),
            vec![
                Token::String("u".into()),
                Token::At,
                Token::String("h".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn dot_in_qualified_column() {
        assert_eq!(
            lex("t.col"),
            vec![
                Token::Ident("t".into()),
                Token::Dot,
                Token::Ident("col".into()),
                Token::Eof,
            ]
        );
    }

    // --- v0.11 brackets + distance op + vector keyword --------------------

    #[test]
    fn brackets_are_distinct_tokens() {
        assert_eq!(
            lex("[ ]"),
            vec![Token::LBracket, Token::RBracket, Token::Eof]
        );
    }

    #[test]
    fn l2_distance_is_three_char_token() {
        assert_eq!(
            lex("a <-> b"),
            vec![
                Token::Ident("a".into()),
                Token::L2Distance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        // Bare `<-` should NOT match L2Distance.
        assert_eq!(
            lex("a <- b"),
            vec![
                Token::Ident("a".into()),
                Token::Lt,
                Token::Minus,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn order_by_limit_are_keywords() {
        assert_eq!(
            lex("ORDER BY LIMIT"),
            vec![Token::Order, Token::By, Token::Limit, Token::Eof]
        );
    }

    // --- v1.2: pgvector distance ops + PG cast --------------------------

    #[test]
    fn inner_product_operator_3char() {
        assert_eq!(
            lex("a <#> b"),
            vec![
                Token::Ident("a".into()),
                Token::InnerProduct,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn cosine_distance_operator_3char() {
        assert_eq!(
            lex("a <=> b"),
            vec![
                Token::Ident("a".into()),
                Token::CosineDistance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        // Make sure the shorter `<=` still lexes right now that `<=>`
        // exists (greedy match takes the longest). `<>` and `<->` are
        // exercised by other tests in this module.
        assert_eq!(
            lex("a <= b"),
            vec![
                Token::Ident("a".into()),
                Token::LtEq,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn double_colon_cast_token() {
        assert_eq!(
            lex("x::INT"),
            vec![
                Token::Ident("x".into()),
                Token::DoubleColon,
                Token::Ident("int".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn lone_single_colon_lexes_as_colon_token() {
        // v7.12.4 — single `:` is now a token (PL/pgSQL surface
        // + tsvector external-form literal both need it). The
        // pre-v7.12.4 "single colon = unknown char" behaviour
        // was incidental.
        let toks = tokenize(":x").expect("colon now lexes");
        assert_eq!(toks[0], Token::Colon);
    }

    #[test]
    fn colon_eq_lexes_as_assignment() {
        // v7.12.4 — PL/pgSQL assignment operator.
        let toks = tokenize("x := 1").expect("colon-eq lexes");
        // Expected tokens: Ident("x"), ColonEq, Integer(1), Eof.
        // Only the operator at index 1 is asserted.
        assert!(matches!(toks[1], Token::ColonEq));
    }

    #[test]
    fn pg_escape_string_double_backslash_decodes_to_single() {
        // v7.18 — E'\\xdeadbeef' decodes to literal `\xdeadbeef`
        // (10 chars: backslash + xdeadbeef). The downstream
        // `::bytea` cast then reads that as the PG hex-form bytea
        // literal. mailrs D-pre #3.
        let toks = tokenize(r"E'\\xdeadbeef'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String(r"\xdeadbeef".into()), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_supports_basic_escapes() {
        // \n / \t / \' / \\ — the PG standard set.
        let toks = tokenize(r"E'a\nb\tc\'d\\e'").expect("E-string lexes");
        assert_eq!(
            toks,
            vec![Token::String("a\nb\tc'd\\e".into()), Token::Eof]
        );
    }

    #[test]
    fn pg_escape_string_hex_byte() {
        // \xHH single byte. \x41 = 'A', \x42 = 'B'; the plain `B`
        // between them is copied through unchanged.
        let toks = tokenize(r"E'\x41B\x42'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String("ABB".into()), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_lowercase_e_prefix() {
        // The prefix letter is accepted in lower case as well.
        let toks = tokenize(r"e'hi\n'").expect("e-string lexes");
        assert_eq!(toks, vec![Token::String("hi\n".into()), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_doubled_quote() {
        // Even in E-string the doubled '' is a literal '.
        let toks = tokenize(r"E'it''s ok'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String("it's ok".into()), Token::Eof]);
    }
}
spg_sql/lexer.rs

spg_sql/
lexer.rs