spg_sql/
lexer.rs

1//! Lexer for the PG-dialect subset that SPG accepts.
2//!
3//! v0.2 token stream is value-only — no source spans yet. Errors do report
4//! the byte offset where the offending construct started. Identifiers are
5//! ASCII case-folded to lower-case (matches PG when un-quoted). Quoted
6//! identifiers (`"..."`) preserve case; `""` is an embedded quote.
//! String literals (`'...'`) follow PG single-quote convention with `''`
//! as the embedded quote. Escape strings (`E'...'`) are decoded by the lexer
//! and dollar-quoted strings (`$$ … $$` / `$tag$ … $tag$`) are captured
//! verbatim; both surface as `Token::String`.
10
11use alloc::string::{String, ToString};
12use alloc::vec::Vec;
13use core::fmt;
14
/// One lexical token produced by `tokenize`.
///
/// Keywords are unit variants; identifiers and literals carry their value.
/// The stream returned by `tokenize` always ends with [`Token::Eof`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    As,
    Null,
    True,
    False,
    And,
    Or,
    Not,
    Create,
    Table,
    Insert,
    Into,
    Values,
    Index,
    On,
    Begin,
    Commit,
    Rollback,
    Order,
    By,
    Limit,

    // Identifiers
    Ident(String),       // ASCII case-folded
    QuotedIdent(String), // original case, "" → "
    /// v7.14.0 — MySQL session / user variable reference
    /// (`@VAR` / `@@VAR`). The wrapped string is the verbatim
    /// source form (including the `@` / `@@` prefix). Used by
    /// mysqldump preamble (`SET @OLD_FOREIGN_KEY_CHECKS =
    /// @@FOREIGN_KEY_CHECKS, …`); SPG accepts the token and
    /// the SET parser treats the assignment as a no-op apart
    /// from any second LHS that targets a real session
    /// parameter (e.g. `FOREIGN_KEY_CHECKS=0`).
    SessionVar(String),

    // Literals
    Integer(i64),
    Float(f64),
    /// String literal body. Produced for `'...'`, `E'...'` and
    /// dollar-quoted (`$$ … $$` / `$tag$ … $tag$`) source forms.
    String(String),

    // Operators
    Plus,
    Minus,
    Star,
    Slash,
    Eq,
    /// Inequality — lexed from both `<>` and `!=`.
    NotEq,
    Lt,
    LtEq,
    Gt,
    GtEq,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR strict contained-in
    /// `<<`. LHS is strictly inside RHS (no equality).
    InetContainedBy,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR contained-in-or-equal
    /// `<<=`. LHS network ⊆ RHS network.
    InetContainedByEq,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR strict contains `>>`.
    /// LHS strictly contains RHS.
    InetContains,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR contains-or-equal `>>=`.
    /// LHS network ⊇ RHS network.
    InetContainsEq,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR network overlap `&&`.
    /// Either side contains any address of the other.
    InetOverlap,

    // Punctuation
    LParen,
    RParen,
    LBracket,
    RBracket,
    Comma,
    Semicolon,
    Dot,
    /// v7.17.0 Phase 2.6 — standalone `@` punctuation. Emitted when
    /// a lone `@` is NOT followed by any session-variable name byte
    /// (i.e. the `@VAR` / `@@VAR` SessionVar path doesn't match).
    /// Lets the parser stitch the MySQL `'user'@'host'` DEFINER form
    /// back together as String + At + String. Pre-2.6 this same shape
    /// surfaced as a `LexErrorKind::UnknownChar('@')` and broke
    /// every mysqldump CREATE VIEW with a DEFINER clause at lex
    /// time.
    At,
    /// v4.14 `->` — JSON object/array element access, returns json.
    JsonGet,
    /// v4.14 `->>` — same access, returns text.
    JsonGetText,
    /// v6.4.5 `#>` — JSON path walk, returns json. Path is the
    /// right-hand TEXT with PG `{a,b,0}` syntax.
    JsonGetPath,
    /// v6.4.5 `#>>` — same walk, returns text.
    JsonGetPathText,
    /// v6.4.5 `@>` — JSON containment. `j @> sub` returns true if
    /// every key/value in `sub` is present in `j` with structural
    /// containment for objects + arrays.
    JsonContains,
    /// v7.12.2 `@@` — tsvector / tsquery match. Either ordering
    /// (`vec @@ q` or `q @@ vec`) parses; engine eval normalises
    /// before matching.
    TsMatch,
    /// pgvector L2 distance operator `<->`. Lexed as one token so the
    /// parser can give it its own precedence rung.
    L2Distance,
    /// pgvector inner-product operator `<#>` (returns negative dot product
    /// so smaller still means more similar — same semantics as pgvector).
    InnerProduct,
    /// pgvector cosine distance operator `<=>`.
    CosineDistance,
    /// PG-style cast `expr::type` — single token because we want it to bind
    /// at postfix precedence.
    DoubleColon,
    /// v7.12.4 — PL/pgSQL assignment operator `:=`.
    /// Outside PL/pgSQL bodies this token has no SQL-side meaning.
    ColonEq,
    /// v7.12.4 — bare `:` separator. Used inside `tsvector` external-form
    /// literals (`'cat:1 dog:2'::tsvector`) and as the fallback path for
    /// the PL/pgSQL assignment lexer.
    Colon,
    /// Standard SQL string concatenation `||`.
    Concat,
    /// Bitwise OR `|` (single pipe — `||` lexes as Concat first).
    Pipe,
    /// Bitwise AND `&` (single amp — `&&` lexes as InetOverlap first).
    Amp,
    /// Bitwise NOT `~` (prefix).
    Tilde,

    // Additional keywords
    /// `IS` keyword — postfix `IS NULL` / `IS NOT NULL` predicates.
    Is,
    Between,
    In,
    Like,
    Group,
    Distinct,
    Union,
    All,
    Join,
    Inner,
    Left,
    Cross,
    Outer,
    Default,
    Savepoint,
    Release,
    To,
    Having,
    Show,
    Extract,
    Offset,
    Asc,
    Desc,
    /// `INTERVAL` — followed by a string literal carrying the span text
    /// (e.g. `INTERVAL '1 day 2 hours'`).
    Interval,
    /// v6.1.1 — `$N` parameter placeholder for the extended query
    /// protocol. The number N is 1-based per PostgreSQL convention.
    /// `$0` is not valid; the lexer rejects it, as it does any N that
    /// does not fit in a `u16`.
    Placeholder(u16),

    /// v6.1.2 — `DROP` keyword. Used by `DROP PUBLICATION <name>`.
    /// Reserved for future `DROP TABLE` / `DROP INDEX` / `DROP USER`
    /// surface that currently goes through SHOW-shaped admin SQL.
    Drop,
    /// v6.1.2 — `FOR` keyword (publication scope).
    For,
    /// v6.1.2 — `TABLES` plural keyword (`FOR ALL TABLES`,
    /// `FOR ALL TABLES EXCEPT …`). The existing `TABLE` keyword
    /// stays a separate token so `CREATE TABLE`'s single-table
    /// form keeps lexing as today.
    Tables,
    /// v6.1.3 (reserved at v6.1.2 to keep the AST shape stable) —
    /// `EXCEPT` keyword for `FOR ALL TABLES EXCEPT t1, t2`.
    Except,
    /// v6.1.2 — `PUBLICATION` keyword.
    Publication,
    /// v6.1.4 (reserved at v6.1.2) — `SUBSCRIPTION` keyword.
    Subscription,
    /// v6.1.4 — `CONNECTION` keyword (for
    /// `CREATE SUBSCRIPTION … CONNECTION '<conn_str>' …`).
    Connection,

    /// End-of-input marker; `tokenize` appends exactly one as the final
    /// token.
    Eof,
}
203
/// The category of failure reported by the lexer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A character that does not start any recognised token.
    UnknownChar(char),
    /// A string literal with no closing delimiter. `tokenize` also reports
    /// this for a dollar-quoted string whose closing tag is missing.
    UnterminatedString,
    /// A quoted identifier with no closing quote.
    UnterminatedQuotedIdent,
    /// A `/*` block comment with no matching `*/`.
    UnterminatedBlockComment,
    /// An invalid numeric form; the payload is the offending source text.
    /// `tokenize` also uses this for `$N` placeholders where N is zero or
    /// does not fit in a `u16`.
    BadNumber(String),
}
212
/// A lexing failure: what went wrong and where.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// The category of failure.
    pub kind: LexErrorKind,
    /// Byte offset into the input at which the offending construct starts.
    pub pos: usize,
}
218
219impl fmt::Display for LexError {
220    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
221        match &self.kind {
222            LexErrorKind::UnknownChar(c) => write!(f, "unknown char {c:?} at byte {}", self.pos),
223            LexErrorKind::UnterminatedString => {
224                write!(f, "unterminated string literal at byte {}", self.pos)
225            }
226            LexErrorKind::UnterminatedQuotedIdent => {
227                write!(f, "unterminated quoted identifier at byte {}", self.pos)
228            }
229            LexErrorKind::UnterminatedBlockComment => {
230                write!(f, "unterminated /* */ comment at byte {}", self.pos)
231            }
232            LexErrorKind::BadNumber(s) => {
233                write!(f, "invalid number literal {s:?} at byte {}", self.pos)
234            }
235        }
236    }
237}
238
239/// Tokenize `input` into a `Vec<Token>` ending in `Token::Eof`.
240#[allow(clippy::too_many_lines)] // big match — splitting would obscure the dispatch table
241pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
242    let bytes = input.as_bytes();
243    let mut i = 0usize;
244    let mut out = Vec::new();
245
246    while i < bytes.len() {
247        let b = bytes[i];
248        match b {
249            b' ' | b'\t' | b'\n' | b'\r' => {
250                i += 1;
251            }
252            b'-' if peek_eq(bytes, i + 1, b'-') => {
253                i += 2;
254                while i < bytes.len() && bytes[i] != b'\n' {
255                    i += 1;
256                }
257            }
258            b'/' if peek_eq(bytes, i + 1, b'*') => {
259                let start = i;
260                // v7.14.0 — MySQL versioned conditional comment
261                // `/*!NNNNN <body> */`. The body is real SQL that
262                // MySQL/MariaDB executes when the runtime version
263                // matches the 5-digit code; PG strips the whole
264                // thing as a block comment. SPG sides with MySQL
265                // semantics for dump compatibility: skip the
266                // `/*!NNNNN ` prefix and continue lexing the body
267                // as ordinary tokens. The closing `*/` is later
268                // matched + skipped by the symmetric arm below.
269                if peek_eq(bytes, i + 2, b'!') {
270                    let mut j = i + 3;
271                    // skip the optional 5-digit version code +
272                    // following single whitespace
273                    while j < bytes.len() && bytes[j].is_ascii_digit() {
274                        j += 1;
275                    }
276                    if j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') {
277                        j += 1;
278                    }
279                    i = j;
280                    continue;
281                }
282                i += 2;
283                let mut closed = false;
284                while i + 1 < bytes.len() {
285                    if bytes[i] == b'*' && bytes[i + 1] == b'/' {
286                        i += 2;
287                        closed = true;
288                        break;
289                    }
290                    i += 1;
291                }
292                if !closed {
293                    return Err(LexError {
294                        kind: LexErrorKind::UnterminatedBlockComment,
295                        pos: start,
296                    });
297                }
298            }
299            // v7.14.0 — bare `*/` (closing of the v7.14 MySQL
300            // versioned-comment opener that didn't consume the
301            // closer). We treat it as an inline comment terminator
302            // and skip 2 bytes.
303            b'*' if peek_eq(bytes, i + 1, b'/') => {
304                i += 2;
305            }
306            b'\'' => {
307                let (tok, consumed) = lex_quoted(input, i, b'\'', false)?;
308                out.push(tok);
309                i += consumed;
310            }
311            // v7.18 — PG escape-string literal `E'...'` / `e'...'`.
312            // Closes the mailrs D-pre #3 reverse-acceptance gap:
313            // `INSERT INTO oq VALUES (E'\\xdeadbeef'::bytea)` needs
314            // the `E` prefix so `\\` decodes to a single `\`. The
315            // produced Token::String carries the decoded body so
316            // downstream parser / cast paths treat it identically
317            // to a regular string literal.
318            b'E' | b'e' if peek_eq(bytes, i + 1, b'\'') => {
319                let (tok, consumed) = lex_escape_string(input, i + 1)?;
320                out.push(tok);
321                i += 1 + consumed;
322            }
323            b'"' => {
324                let (tok, consumed) = lex_quoted(input, i, b'"', true)?;
325                out.push(tok);
326                i += consumed;
327            }
328            // MySQL-flavoured backtick-quoted identifier. Same semantics
329            // as the standard `"..."` form, including embedded "``" as
330            // a literal backtick.
331            b'`' => {
332                let (tok, consumed) = lex_quoted(input, i, b'`', true)?;
333                out.push(tok);
334                i += consumed;
335            }
336            b if b.is_ascii_alphabetic() || b == b'_' => {
337                let start = i;
338                i += 1;
339                while i < bytes.len() {
340                    let c = bytes[i];
341                    if c.is_ascii_alphanumeric() || c == b'_' {
342                        i += 1;
343                    } else {
344                        break;
345                    }
346                }
347                let raw = &input[start..i];
348                // v3.0.5: try the keyword table case-insensitively
349                // without allocating; only the ident fall-through
350                // pays for a lowercase String.
351                out.push(keyword_or_ident_raw(raw));
352            }
353            b if b.is_ascii_digit() => {
354                let (tok, consumed) =
355                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
356                out.push(tok);
357                i += consumed;
358            }
359            b'.' if peek_pred(bytes, i + 1, u8::is_ascii_digit) => {
360                let (tok, consumed) =
361                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
362                out.push(tok);
363                i += consumed;
364            }
365            b'+' => single(&mut out, Token::Plus, &mut i),
366            b'-' => {
367                // v4.14: `->>` and `->` for JSON path access. `->>`
368                // must be tried before `->` (longest match).
369                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
370                    out.push(Token::JsonGetText);
371                    i += 3;
372                } else if peek_eq(bytes, i + 1, b'>') {
373                    out.push(Token::JsonGet);
374                    i += 2;
375                } else {
376                    single(&mut out, Token::Minus, &mut i);
377                }
378            }
379            // v6.4.5: `#>>` and `#>` JSON path walk.
380            b'#' => {
381                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
382                    out.push(Token::JsonGetPathText);
383                    i += 3;
384                } else if peek_eq(bytes, i + 1, b'>') {
385                    out.push(Token::JsonGetPath);
386                    i += 2;
387                } else {
388                    return Err(LexError {
389                        kind: LexErrorKind::UnknownChar('#'),
390                        pos: i,
391                    });
392                }
393            }
394            // v6.4.5: `@>` JSON containment.
395            // v7.12.2: `@@` tsvector / tsquery match.
396            // v7.14.0: `@@NAME` MySQL session variable ref +
397            //          `@NAME` user variable ref. mysqldump preamble
398            //          uses both heavily (`SET @OLD_FOREIGN_KEY_CHECKS
399            //          = @@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0`).
400            //          We lex both as a single SessionVar token so
401            //          the parser can accept and ignore them.
402            b'@' => {
403                if peek_eq(bytes, i + 1, b'>') {
404                    out.push(Token::JsonContains);
405                    i += 2;
406                } else if peek_eq(bytes, i + 1, b'@')
407                    && !is_session_var_ident_start(bytes.get(i + 2).copied())
408                {
409                    // `@@` not followed by an ident-start byte is
410                    // the tsquery `@@` operator.
411                    out.push(Token::TsMatch);
412                    i += 2;
413                } else {
414                    // `@VAR` / `@@VAR` — MySQL user / session
415                    // variable reference. Consume the ident-shaped
416                    // tail and emit as Token::SessionVar so the
417                    // SET parser can accept-and-ignore.
418                    let prefix_end = if peek_eq(bytes, i + 1, b'@') {
419                        i + 2
420                    } else {
421                        i + 1
422                    };
423                    let mut end = prefix_end;
424                    while end < bytes.len() && is_session_var_ident_continue(bytes[end]) {
425                        end += 1;
426                    }
427                    if end == prefix_end {
428                        // v7.17.0 Phase 2.6 — `@` not followed by an
429                        // ident-shaped tail. mysqldump's DEFINER
430                        // form `'user'@'host'` lands here (next
431                        // byte is `'`). Emit as Token::At so the
432                        // parser can stitch the surrounding String
433                        // tokens. Single `@@` already short-circuits
434                        // to Token::TsMatch above, so this only
435                        // fires for a true lone `@`.
436                        out.push(Token::At);
437                        i = prefix_end;
438                        continue;
439                    }
440                    out.push(Token::SessionVar(input[i..end].to_string()));
441                    i = end;
442                }
443            }
444            b'*' => single(&mut out, Token::Star, &mut i),
445            b'/' => single(&mut out, Token::Slash, &mut i),
446            b'(' => single(&mut out, Token::LParen, &mut i),
447            b')' => single(&mut out, Token::RParen, &mut i),
448            b'[' => single(&mut out, Token::LBracket, &mut i),
449            b']' => single(&mut out, Token::RBracket, &mut i),
450            b',' => single(&mut out, Token::Comma, &mut i),
451            b';' => single(&mut out, Token::Semicolon, &mut i),
452            b'.' => single(&mut out, Token::Dot, &mut i),
453            b'=' => single(&mut out, Token::Eq, &mut i),
454            b'<' => {
455                if peek_eq(bytes, i + 1, b'=') && peek_eq(bytes, i + 2, b'>') {
456                    out.push(Token::CosineDistance);
457                    i += 3;
458                } else if peek_eq(bytes, i + 1, b'#') && peek_eq(bytes, i + 2, b'>') {
459                    out.push(Token::InnerProduct);
460                    i += 3;
461                } else if peek_eq(bytes, i + 1, b'-') && peek_eq(bytes, i + 2, b'>') {
462                    out.push(Token::L2Distance);
463                    i += 3;
464                } else if peek_eq(bytes, i + 1, b'<') && peek_eq(bytes, i + 2, b'=') {
465                    // v7.17.0 Phase 3.P0-47 — PG INET `<<=` contained-or-equal.
466                    out.push(Token::InetContainedByEq);
467                    i += 3;
468                } else if peek_eq(bytes, i + 1, b'<') {
469                    // v7.17.0 Phase 3.P0-47 — PG INET `<<` strict contained.
470                    out.push(Token::InetContainedBy);
471                    i += 2;
472                } else if peek_eq(bytes, i + 1, b'=') {
473                    out.push(Token::LtEq);
474                    i += 2;
475                } else if peek_eq(bytes, i + 1, b'>') {
476                    out.push(Token::NotEq);
477                    i += 2;
478                } else {
479                    out.push(Token::Lt);
480                    i += 1;
481                }
482            }
483            b':' if peek_eq(bytes, i + 1, b':') => {
484                out.push(Token::DoubleColon);
485                i += 2;
486            }
487            b':' if peek_eq(bytes, i + 1, b'=') => {
488                // v7.12.4 — PL/pgSQL assignment operator `:=`.
489                out.push(Token::ColonEq);
490                i += 2;
491            }
492            b':' => {
493                // v7.12.4 — bare `:`. Used inside `tsvector` external-form
494                // literals which the cast parser consumes in-token, and as a
495                // separator the PL/pgSQL assignment lexer can recover from.
496                out.push(Token::Colon);
497                i += 1;
498            }
499            b'|' if peek_eq(bytes, i + 1, b'|') => {
500                out.push(Token::Concat);
501                i += 2;
502            }
503            // Bitwise operators (PG integer ops; mailrs IMAP flag
504            // masks: `flags | $1`, `flags & ~$1`).
505            b'|' => {
506                single(&mut out, Token::Pipe, &mut i);
507            }
508            b'~' => {
509                single(&mut out, Token::Tilde, &mut i);
510            }
511            b'>' => {
512                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'=') {
513                    // v7.17.0 Phase 3.P0-47 — PG INET `>>=` contains-or-equal.
514                    out.push(Token::InetContainsEq);
515                    i += 3;
516                } else if peek_eq(bytes, i + 1, b'>') {
517                    // v7.17.0 Phase 3.P0-47 — PG INET `>>` strict contains.
518                    out.push(Token::InetContains);
519                    i += 2;
520                } else if peek_eq(bytes, i + 1, b'=') {
521                    out.push(Token::GtEq);
522                    i += 2;
523                } else {
524                    out.push(Token::Gt);
525                    i += 1;
526                }
527            }
528            b'&' if peek_eq(bytes, i + 1, b'&') => {
529                // v7.17.0 Phase 3.P0-47 — PG INET network overlap `&&`.
530                out.push(Token::InetOverlap);
531                i += 2;
532            }
533            b'&' => {
534                single(&mut out, Token::Amp, &mut i);
535            }
536            b'!' if peek_eq(bytes, i + 1, b'=') => {
537                out.push(Token::NotEq);
538                i += 2;
539            }
540            // v7.9.27 — PG dollar-quoted string `$$ … $$` (or
541            // `$tag$ … $tag$`). Used in `DO $$ … $$ LANGUAGE
542            // plpgsql;` blocks that pg_dump emits for idempotent
543            // migrations. SPG has no PL/pgSQL, so the lexer
544            // consumes the entire string as a single Token::String
545            // and the parser treats the surrounding `DO …;` as a
546            // no-op. mailrs follow-up H1.
547            b'$' if i + 1 < bytes.len() && bytes[i + 1] == b'$' => {
548                // Empty tag form: `$$ … $$`.
549                let end = find_dollar_tag_end(bytes, i + 2, b"$$");
550                let body = match end {
551                    Some(e) => &input[i + 2..e],
552                    None => {
553                        return Err(LexError {
554                            kind: LexErrorKind::UnterminatedString,
555                            pos: i,
556                        });
557                    }
558                };
559                out.push(Token::String(body.to_string()));
560                i = end.unwrap() + 2;
561            }
562            b'$' if i + 1 < bytes.len()
563                && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_') =>
564            {
565                // Tagged form: `$foo$ … $foo$`. Scan the tag
566                // ident, find the closing copy.
567                let mut j = i + 1;
568                while j < bytes.len() && (bytes[j].is_ascii_alphanumeric() || bytes[j] == b'_') {
569                    j += 1;
570                }
571                if j >= bytes.len() || bytes[j] != b'$' {
572                    // Not a dollar-quoted string — fall through
573                    // to the generic-unknown-char path.
574                    let ch = input[i..].chars().next().unwrap_or('?');
575                    return Err(LexError {
576                        kind: LexErrorKind::UnknownChar(ch),
577                        pos: i,
578                    });
579                }
580                let close: alloc::vec::Vec<u8> = bytes[i..=j].to_vec();
581                let end = find_dollar_tag_end(bytes, j + 1, &close);
582                let body = match end {
583                    Some(e) => &input[j + 1..e],
584                    None => {
585                        return Err(LexError {
586                            kind: LexErrorKind::UnterminatedString,
587                            pos: i,
588                        });
589                    }
590                };
591                out.push(Token::String(body.to_string()));
592                i = end.unwrap() + close.len();
593            }
594            // v6.1.1: `$N` parameter placeholder for the extended
595            // query protocol. PG numbers them 1..=N; we reject $0
596            // and a bare `$` not followed by a digit.
597            b'$' if i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() => {
598                let mut j = i + 1;
599                let mut n: u32 = 0;
600                while j < bytes.len() && bytes[j].is_ascii_digit() {
601                    n = n
602                        .saturating_mul(10)
603                        .saturating_add(u32::from(bytes[j] - b'0'));
604                    j += 1;
605                }
606                if n == 0 || n > u32::from(u16::MAX) {
607                    return Err(LexError {
608                        kind: LexErrorKind::BadNumber(input[i..j].to_string()),
609                        pos: i,
610                    });
611                }
612                #[allow(clippy::cast_possible_truncation)]
613                out.push(Token::Placeholder(n as u16));
614                i = j;
615            }
616            _ => {
617                let ch = input[i..].chars().next().unwrap_or('?');
618                return Err(LexError {
619                    kind: LexErrorKind::UnknownChar(ch),
620                    pos: i,
621                });
622            }
623        }
624    }
625    out.push(Token::Eof);
626    Ok(out)
627}
628
/// Returns `true` when `bytes` has a byte at index `i` and it equals
/// `target`. An out-of-range index yields `false` instead of panicking.
fn peek_eq(bytes: &[u8], i: usize, target: u8) -> bool {
    bytes.get(i).is_some_and(|&found| found == target)
}
632
/// v7.14.0 — tests whether `b` can be the first byte of a MySQL
/// session/user variable name (the byte right after `@` or `@@`).
/// Accepts an ASCII letter or an underscore; `None` (end of input)
/// and every other byte are rejected.
fn is_session_var_ident_start(b: Option<u8>) -> bool {
    b.is_some_and(|first| first == b'_' || first.is_ascii_alphabetic())
}
640
/// Tests whether `b` may appear inside a `@VAR` / `@@VAR` name.
/// Accepts ASCII letters and digits plus `_`, `.` (scope qualifiers
/// such as `@@global.sql_mode`) and `$`.
fn is_session_var_ident_continue(b: u8) -> bool {
    matches!(b, b'_' | b'.' | b'$') || b.is_ascii_alphanumeric()
}
648
/// v7.9.27 — locates the next occurrence of `tag` (e.g. `b"$$"` or
/// `b"$foo$"`) in `bytes`, searching from index `from` onwards.
///
/// Returns the absolute start index of the match, or `None` when `tag`
/// is empty, `from` lies beyond the end of `bytes`, or no match exists.
fn find_dollar_tag_end(bytes: &[u8], from: usize, tag: &[u8]) -> Option<usize> {
    // `windows` panics on a zero width, so reject an empty tag up front.
    if tag.is_empty() {
        return None;
    }
    // `get` yields `None` when `from` is past the end of the input.
    let tail = bytes.get(from..)?;
    tail.windows(tag.len())
        .position(|candidate| candidate == tag)
        .map(|offset| from + offset)
}
664
/// Applies `pred` to the byte at index `i` of `bytes`. Returns `false`
/// without calling `pred` when `i` is out of range.
fn peek_pred<F: Fn(&u8) -> bool>(bytes: &[u8], i: usize, pred: F) -> bool {
    match bytes.get(i) {
        Some(byte) => pred(byte),
        None => false,
    }
}
668
669fn single(out: &mut Vec<Token>, tok: Token, i: &mut usize) {
670    out.push(tok);
671    *i += 1;
672}
673
674/// Length-first ASCII-CI keyword lookup. Avoids allocating a
675/// lowercase `String` when the input matches a keyword; only the ident
676/// fall-through path pays for the lowercase copy.
677///
678/// Grouped by length so the outer `match` becomes a small jump table.
679/// Within a length bucket every keyword has either a unique first
680/// byte (cheap dispatch) or a small set of disambiguating
681/// trailing-byte comparisons. All comparisons are ASCII-CI (XOR
682/// 0x20 on each byte before the compare).
683fn keyword_or_ident_raw(raw: &str) -> Token {
684    let b = raw.as_bytes();
685    let tok = match b.len() {
686        2 => kw_len2(b),
687        3 => kw_len3(b),
688        4 => kw_len4(b),
689        5 => kw_len5(b),
690        6 => kw_len6(b),
691        7 => kw_len7(b),
692        8 => kw_len8(b),
693        9 => kw_len9(b),
694        10 => kw_len10(b),
695        11 => kw_len11(b),
696        12 => kw_len12(b),
697        _ => None,
698    };
699    match tok {
700        Some(t) => t,
701        // Ident fall-through: this is the only path that allocates.
702        None => Token::Ident(raw.to_ascii_lowercase()),
703    }
704}
705
/// ASCII case-insensitive comparison of `input` against `lower`.
///
/// Only `input` is case-folded (via `to_ascii_lowercase`); `lower` is
/// compared as given, so callers must pass an already-lowercase literal.
/// Slices of different lengths never compare equal.
#[inline]
fn eq_ci(input: &[u8], lower: &[u8]) -> bool {
    input.len() == lower.len()
        && input
            .iter()
            .zip(lower)
            .all(|(have, want)| have.to_ascii_lowercase() == *want)
}
723
724#[inline]
725fn kw_len2(b: &[u8]) -> Option<Token> {
726    // 7 keywords: as, by, in, is, on, or, to
727    if eq_ci(b, b"as") {
728        return Some(Token::As);
729    }
730    if eq_ci(b, b"by") {
731        return Some(Token::By);
732    }
733    if eq_ci(b, b"in") {
734        return Some(Token::In);
735    }
736    if eq_ci(b, b"is") {
737        return Some(Token::Is);
738    }
739    if eq_ci(b, b"on") {
740        return Some(Token::On);
741    }
742    if eq_ci(b, b"or") {
743        return Some(Token::Or);
744    }
745    if eq_ci(b, b"to") {
746        return Some(Token::To);
747    }
748    None
749}
750
751#[inline]
752fn kw_len3(b: &[u8]) -> Option<Token> {
753    // 5 keywords: all, and, asc, not, for
754    if eq_ci(b, b"for") {
755        return Some(Token::For);
756    }
757    if eq_ci(b, b"all") {
758        return Some(Token::All);
759    }
760    if eq_ci(b, b"and") {
761        return Some(Token::And);
762    }
763    if eq_ci(b, b"asc") {
764        return Some(Token::Asc);
765    }
766    if eq_ci(b, b"not") {
767        return Some(Token::Not);
768    }
769    None
770}
771
772#[inline]
773fn kw_len4(b: &[u8]) -> Option<Token> {
774    // 10 keywords: from, null, true, into, like, join, left, show, desc, drop
775    if eq_ci(b, b"from") {
776        return Some(Token::From);
777    }
778    if eq_ci(b, b"drop") {
779        return Some(Token::Drop);
780    }
781    if eq_ci(b, b"null") {
782        return Some(Token::Null);
783    }
784    if eq_ci(b, b"true") {
785        return Some(Token::True);
786    }
787    if eq_ci(b, b"into") {
788        return Some(Token::Into);
789    }
790    if eq_ci(b, b"like") {
791        return Some(Token::Like);
792    }
793    if eq_ci(b, b"join") {
794        return Some(Token::Join);
795    }
796    if eq_ci(b, b"left") {
797        return Some(Token::Left);
798    }
799    if eq_ci(b, b"show") {
800        return Some(Token::Show);
801    }
802    if eq_ci(b, b"desc") {
803        return Some(Token::Desc);
804    }
805    None
806}
807
808#[inline]
809fn kw_len5(b: &[u8]) -> Option<Token> {
810    // 12 keywords: false, where, table, index, begin, order, limit,
811    // group, union, inner, cross, outer
812    if eq_ci(b, b"false") {
813        return Some(Token::False);
814    }
815    if eq_ci(b, b"where") {
816        return Some(Token::Where);
817    }
818    if eq_ci(b, b"table") {
819        return Some(Token::Table);
820    }
821    if eq_ci(b, b"index") {
822        return Some(Token::Index);
823    }
824    if eq_ci(b, b"begin") {
825        return Some(Token::Begin);
826    }
827    if eq_ci(b, b"order") {
828        return Some(Token::Order);
829    }
830    if eq_ci(b, b"limit") {
831        return Some(Token::Limit);
832    }
833    if eq_ci(b, b"group") {
834        return Some(Token::Group);
835    }
836    if eq_ci(b, b"union") {
837        return Some(Token::Union);
838    }
839    if eq_ci(b, b"inner") {
840        return Some(Token::Inner);
841    }
842    if eq_ci(b, b"cross") {
843        return Some(Token::Cross);
844    }
845    if eq_ci(b, b"outer") {
846        return Some(Token::Outer);
847    }
848    None
849}
850
851#[inline]
852fn kw_len6(b: &[u8]) -> Option<Token> {
853    // 9 keywords: select, create, insert, values, commit, having, offset, tables, except
854    if eq_ci(b, b"select") {
855        return Some(Token::Select);
856    }
857    if eq_ci(b, b"tables") {
858        return Some(Token::Tables);
859    }
860    if eq_ci(b, b"except") {
861        return Some(Token::Except);
862    }
863    if eq_ci(b, b"create") {
864        return Some(Token::Create);
865    }
866    if eq_ci(b, b"insert") {
867        return Some(Token::Insert);
868    }
869    if eq_ci(b, b"values") {
870        return Some(Token::Values);
871    }
872    if eq_ci(b, b"commit") {
873        return Some(Token::Commit);
874    }
875    if eq_ci(b, b"having") {
876        return Some(Token::Having);
877    }
878    if eq_ci(b, b"offset") {
879        return Some(Token::Offset);
880    }
881    None
882}
883
884#[inline]
885fn kw_len7(b: &[u8]) -> Option<Token> {
886    // 4 keywords: between, default, release, extract
887    if eq_ci(b, b"between") {
888        return Some(Token::Between);
889    }
890    if eq_ci(b, b"default") {
891        return Some(Token::Default);
892    }
893    if eq_ci(b, b"release") {
894        return Some(Token::Release);
895    }
896    if eq_ci(b, b"extract") {
897        return Some(Token::Extract);
898    }
899    None
900}
901
902#[inline]
903fn kw_len8(b: &[u8]) -> Option<Token> {
904    // 3 keywords: rollback, distinct, interval
905    if eq_ci(b, b"rollback") {
906        return Some(Token::Rollback);
907    }
908    if eq_ci(b, b"distinct") {
909        return Some(Token::Distinct);
910    }
911    if eq_ci(b, b"interval") {
912        return Some(Token::Interval);
913    }
914    None
915}
916
917#[inline]
918fn kw_len9(b: &[u8]) -> Option<Token> {
919    // 1 keyword: savepoint
920    if eq_ci(b, b"savepoint") {
921        return Some(Token::Savepoint);
922    }
923    None
924}
925
926#[inline]
927fn kw_len10(b: &[u8]) -> Option<Token> {
928    // 1 keyword: connection
929    if eq_ci(b, b"connection") {
930        return Some(Token::Connection);
931    }
932    None
933}
934
935#[inline]
936fn kw_len11(b: &[u8]) -> Option<Token> {
937    // 1 keyword: publication
938    if eq_ci(b, b"publication") {
939        return Some(Token::Publication);
940    }
941    None
942}
943
944#[inline]
945fn kw_len12(b: &[u8]) -> Option<Token> {
946    // 1 keyword: subscription
947    if eq_ci(b, b"subscription") {
948        return Some(Token::Subscription);
949    }
950    None
951}
952
953/// Lex a `'...'` string literal or `"..."` quoted identifier. The opening
954/// quote sits at `input[start]`; `quote` is its byte value. `is_ident` selects
955/// the resulting token shape.
956///
957/// PG-style doubling escapes the quote: `''` inside `'...'` is a literal `'`,
958/// same for `""` inside `"..."`.
959fn lex_quoted(
960    input: &str,
961    start: usize,
962    quote: u8,
963    is_ident: bool,
964) -> Result<(Token, usize), LexError> {
965    let bytes = input.as_bytes();
966    let mut i = start + 1;
967    let mut s = String::new();
968    loop {
969        if i >= bytes.len() {
970            return Err(LexError {
971                kind: if is_ident {
972                    LexErrorKind::UnterminatedQuotedIdent
973                } else {
974                    LexErrorKind::UnterminatedString
975                },
976                pos: start,
977            });
978        }
979        if bytes[i] == quote {
980            if peek_eq(bytes, i + 1, quote) {
981                s.push(quote as char);
982                i += 2;
983            } else {
984                i += 1;
985                break;
986            }
987        } else {
988            let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
989            s.push(ch);
990            i += ch.len_utf8();
991        }
992    }
993    let tok = if is_ident {
994        Token::QuotedIdent(s)
995    } else {
996        Token::String(s)
997    };
998    Ok((tok, i - start))
999}
1000
1001/// v7.18 — Lex a PG escape-string literal `E'...'`. `start` points
1002/// at the opening single quote (the `E` was matched by the caller
1003/// and is NOT part of `start`'s offset semantics — the consumed
1004/// count returned excludes the `E`, which the caller adds).
1005///
1006/// Recognised escape sequences:
1007///   \\ \' \" — literal backslash / quote
1008///   \n \r \t \b \f — standard whitespace controls
1009///   \0 — NUL
1010///   \xHH — single hex byte (1–2 hex digits)
1011///   \NNN — octal byte (1–3 octal digits)
1012/// Any other `\X` decodes to the literal byte `X` (PG warns; SPG
1013/// follows the lenient behaviour pg_dump output relies on).
1014///
1015/// Doubled `''` is still a literal `'` (same as the non-E form).
1016fn lex_escape_string(input: &str, start: usize) -> Result<(Token, usize), LexError> {
1017    let bytes = input.as_bytes();
1018    debug_assert_eq!(bytes[start], b'\'');
1019    let mut i = start + 1;
1020    let mut s = String::new();
1021    loop {
1022        if i >= bytes.len() {
1023            return Err(LexError {
1024                kind: LexErrorKind::UnterminatedString,
1025                pos: start,
1026            });
1027        }
1028        let b = bytes[i];
1029        if b == b'\'' {
1030            if peek_eq(bytes, i + 1, b'\'') {
1031                s.push('\'');
1032                i += 2;
1033                continue;
1034            }
1035            i += 1;
1036            break;
1037        }
1038        if b == b'\\' && i + 1 < bytes.len() {
1039            let n = bytes[i + 1];
1040            match n {
1041                b'\\' => {
1042                    s.push('\\');
1043                    i += 2;
1044                }
1045                b'\'' => {
1046                    s.push('\'');
1047                    i += 2;
1048                }
1049                b'"' => {
1050                    s.push('"');
1051                    i += 2;
1052                }
1053                b'n' => {
1054                    s.push('\n');
1055                    i += 2;
1056                }
1057                b'r' => {
1058                    s.push('\r');
1059                    i += 2;
1060                }
1061                b't' => {
1062                    s.push('\t');
1063                    i += 2;
1064                }
1065                b'b' => {
1066                    s.push('\u{0008}');
1067                    i += 2;
1068                }
1069                b'f' => {
1070                    s.push('\u{000C}');
1071                    i += 2;
1072                }
1073                b'0' if i + 2 >= bytes.len() || !bytes[i + 2].is_ascii_digit() => {
1074                    s.push('\0');
1075                    i += 2;
1076                }
1077                b'x' => {
1078                    // \xH or \xHH — single byte by hex.
1079                    let h1 = bytes.get(i + 2).copied();
1080                    let h2 = bytes.get(i + 3).copied();
1081                    let n1 = h1.and_then(hex_digit_value);
1082                    let n2 = h2.and_then(hex_digit_value);
1083                    match (n1, n2) {
1084                        (Some(a), Some(b2)) => {
1085                            s.push((((a << 4) | b2) as u8) as char);
1086                            i += 4;
1087                        }
1088                        (Some(a), _) => {
1089                            s.push((a as u8) as char);
1090                            i += 3;
1091                        }
1092                        _ => {
1093                            // \x with no hex follows — literal x.
1094                            s.push('x');
1095                            i += 2;
1096                        }
1097                    }
1098                }
1099                d if d.is_ascii_digit() && d < b'8' => {
1100                    // \NNN octal — up to 3 octal digits.
1101                    let mut value: u32 = u32::from(d - b'0');
1102                    let mut take = 2;
1103                    while take < 4 {
1104                        let next = bytes.get(i + take).copied();
1105                        match next {
1106                            Some(c) if c.is_ascii_digit() && c < b'8' => {
1107                                value = (value << 3) | u32::from(c - b'0');
1108                                take += 1;
1109                            }
1110                            _ => break,
1111                        }
1112                    }
1113                    if let Some(c) = char::from_u32(value) {
1114                        s.push(c);
1115                    } else {
1116                        // Invalid Unicode — preserve as raw byte char.
1117                        s.push((value & 0xFF) as u8 as char);
1118                    }
1119                    i += take;
1120                }
1121                other => {
1122                    // Lenient fallback — same as PG with
1123                    // `standard_conforming_strings = off` warning:
1124                    // decode `\X` to literal `X`.
1125                    s.push(other as char);
1126                    i += 2;
1127                }
1128            }
1129        } else {
1130            let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
1131            s.push(ch);
1132            i += ch.len_utf8();
1133        }
1134    }
1135    Ok((Token::String(s), i - start))
1136}
1137
/// Numeric value (0–15) of the ASCII hex digit `b`, accepting `0-9`,
/// `a-f` and `A-F`; `None` for any other byte.
fn hex_digit_value(b: u8) -> Option<u32> {
    // Radix-16 `to_digit` recognises exactly the ASCII hex digits in either
    // case; bytes >= 0x80 map to non-digit code points and yield `None`.
    char::from(b).to_digit(16)
}
1146
1147fn lex_number(s: &str) -> Result<(Token, usize), LexErrorKind> {
1148    let bytes = s.as_bytes();
1149    let mut i = 0usize;
1150    let mut is_float = false;
1151
1152    while i < bytes.len() && bytes[i].is_ascii_digit() {
1153        i += 1;
1154    }
1155    if i < bytes.len() && bytes[i] == b'.' {
1156        is_float = true;
1157        i += 1;
1158        while i < bytes.len() && bytes[i].is_ascii_digit() {
1159            i += 1;
1160        }
1161    }
1162    if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
1163        is_float = true;
1164        i += 1;
1165        if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
1166            i += 1;
1167        }
1168        let exp_start = i;
1169        while i < bytes.len() && bytes[i].is_ascii_digit() {
1170            i += 1;
1171        }
1172        if exp_start == i {
1173            return Err(LexErrorKind::BadNumber(s[..i].to_string()));
1174        }
1175    }
1176
1177    let lit = &s[..i];
1178    if is_float {
1179        lit.parse::<f64>()
1180            .map(|v| (Token::Float(v), i))
1181            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1182    } else {
1183        lit.parse::<i64>()
1184            .map(|v| (Token::Integer(v), i))
1185            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1186    }
1187}
1188
// Unit tests for the lexer. Each test feeds a small SQL fragment to
// `tokenize` and pins the exact token stream (or error kind) it produces.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Tokenize `s`, panicking with "lex ok" if the lexer reports an error.
    fn lex(s: &str) -> Vec<Token> {
        tokenize(s).expect("lex ok")
    }

    #[test]
    fn empty_yields_only_eof() {
        assert_eq!(lex(""), vec![Token::Eof]);
    }

    #[test]
    fn whitespace_only_yields_only_eof() {
        assert_eq!(lex("   \t\n  "), vec![Token::Eof]);
    }

    #[test]
    fn keywords_are_case_insensitive() {
        assert_eq!(
            lex("SELECT select Select"),
            vec![Token::Select, Token::Select, Token::Select, Token::Eof]
        );
    }

    #[test]
    fn identifiers_lowercase_ascii() {
        assert_eq!(
            lex("hello WORLD _x x1"),
            vec![
                Token::Ident("hello".into()),
                Token::Ident("world".into()),
                Token::Ident("_x".into()),
                Token::Ident("x1".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn quoted_identifier_keeps_case_and_handles_embedded_quote() {
        assert_eq!(
            lex(r#""User Name" "a""b""#),
            vec![
                Token::QuotedIdent("User Name".into()),
                Token::QuotedIdent("a\"b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn integer_and_float_literals() {
        assert_eq!(
            lex("0 42 1.5 .5 1e10 2.5e-3"),
            vec![
                Token::Integer(0),
                Token::Integer(42),
                Token::Float(1.5),
                Token::Float(0.5),
                Token::Float(1e10),
                Token::Float(2.5e-3),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn negative_number_is_minus_then_integer() {
        // PG follows this: unary minus is a separate token, parser folds it.
        assert_eq!(
            lex("-42"),
            vec![Token::Minus, Token::Integer(42), Token::Eof]
        );
    }

    #[test]
    fn string_literal_doubled_quote_escape() {
        assert_eq!(
            lex("'hello' 'it''s'"),
            vec![
                Token::String("hello".into()),
                Token::String("it's".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn all_comparison_and_arithmetic_operators() {
        assert_eq!(
            lex("= <> != < <= > >= + - * /"),
            vec![
                Token::Eq,
                Token::NotEq,
                Token::NotEq,
                Token::Lt,
                Token::LtEq,
                Token::Gt,
                Token::GtEq,
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn punctuation() {
        assert_eq!(
            lex("( ) , ; ."),
            vec![
                Token::LParen,
                Token::RParen,
                Token::Comma,
                Token::Semicolon,
                Token::Dot,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn line_comment_skipped() {
        assert_eq!(
            lex("SELECT -- trailing junk\nFROM"),
            vec![Token::Select, Token::From, Token::Eof]
        );
    }

    #[test]
    fn block_comment_skipped() {
        assert_eq!(
            lex("SELECT /* skipped */ 1"),
            vec![Token::Select, Token::Integer(1), Token::Eof]
        );
    }

    #[test]
    fn unterminated_string_errors() {
        let err = tokenize("'oops").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
        // The error position is the offset of the opening quote.
        assert_eq!(err.pos, 0);
    }

    #[test]
    fn unterminated_block_comment_errors() {
        let err = tokenize("/* never closed").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedBlockComment));
    }

    #[test]
    fn unknown_char_errors() {
        // v7.17.0 Phase 2.6 — `@` standalone now lexes as
        // Token::At (mysqldump `'user'@'host'` DEFINER stitching),
        // so it can no longer serve as the unknown-char probe.
        // The regression input is the BEL control byte (`\x07`);
        // a printable symbol such as `?` is deliberately not used
        // (NOTE(review): presumably because `?` is, or may become,
        // part of the PG JSON operator family — confirm).
        let err = tokenize("\x07").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnknownChar(_)));
    }

    #[test]
    fn at_alone_lexes_as_punctuation() {
        // v7.17.0 Phase 2.6 — the `'user'@'host'` MySQL DEFINER
        // form needs `@` to lex as a standalone token.
        assert_eq!(
            lex("'u'@'h'"),
            vec![
                Token::String("u".into()),
                Token::At,
                Token::String("h".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn dot_in_qualified_column() {
        assert_eq!(
            lex("t.col"),
            vec![
                Token::Ident("t".into()),
                Token::Dot,
                Token::Ident("col".into()),
                Token::Eof,
            ]
        );
    }

    // --- v0.11 brackets + distance op + vector keyword --------------------

    #[test]
    fn brackets_are_distinct_tokens() {
        assert_eq!(
            lex("[ ]"),
            vec![Token::LBracket, Token::RBracket, Token::Eof]
        );
    }

    #[test]
    fn l2_distance_is_three_char_token() {
        assert_eq!(
            lex("a <-> b"),
            vec![
                Token::Ident("a".into()),
                Token::L2Distance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        // Bare `<-` should NOT match L2Distance.
        assert_eq!(
            lex("a <- b"),
            vec![
                Token::Ident("a".into()),
                Token::Lt,
                Token::Minus,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn order_by_limit_are_keywords() {
        assert_eq!(
            lex("ORDER BY LIMIT"),
            vec![Token::Order, Token::By, Token::Limit, Token::Eof]
        );
    }

    // --- v1.2: pgvector distance ops + PG cast --------------------------

    #[test]
    fn inner_product_operator_3char() {
        assert_eq!(
            lex("a <#> b"),
            vec![
                Token::Ident("a".into()),
                Token::InnerProduct,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn cosine_distance_operator_3char() {
        assert_eq!(
            lex("a <=> b"),
            vec![
                Token::Ident("a".into()),
                Token::CosineDistance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        // Make sure the shorter `<=` still lexes as LtEq now that `<=>`
        // shares its prefix. (`<>` and `<->` are pinned by
        // `all_comparison_and_arithmetic_operators` and
        // `l2_distance_is_three_char_token`.)
        assert_eq!(
            lex("a <= b"),
            vec![
                Token::Ident("a".into()),
                Token::LtEq,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn double_colon_cast_token() {
        assert_eq!(
            lex("x::INT"),
            vec![
                Token::Ident("x".into()),
                Token::DoubleColon,
                Token::Ident("int".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn lone_single_colon_lexes_as_colon_token() {
        // v7.12.4 — single `:` is now a token (PL/pgSQL surface
        // + tsvector external-form literal both need it). The
        // pre-v7.12.4 "single colon = unknown char" behaviour
        // was incidental.
        let toks = tokenize(":x").expect("colon now lexes");
        assert_eq!(toks[0], Token::Colon);
    }

    #[test]
    fn colon_eq_lexes_as_assignment() {
        // v7.12.4 — PL/pgSQL assignment operator.
        let toks = tokenize("x := 1").expect("colon-eq lexes");
        // Expected tokens: Ident("x"), ColonEq, Integer(1), Eof.
        // Only the operator at index 1 is asserted here.
        assert!(matches!(toks[1], Token::ColonEq));
    }

    #[test]
    fn pg_escape_string_double_backslash_decodes_to_single() {
        // v7.18 — E'\\xdeadbeef' decodes to literal `\xdeadbeef`
        // (10 chars: backslash + xdeadbeef). The downstream
        // `::bytea` cast then reads that as the PG hex-form bytea
        // literal. mailrs D-pre #3.
        let toks = tokenize(r"E'\\xdeadbeef'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String(r"\xdeadbeef".into()), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_supports_basic_escapes() {
        // \n / \t / \' / \\ — the PG standard set.
        let toks = tokenize(r"E'a\nb\tc\'d\\e'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String("a\nb\tc'd\\e".into()), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_hex_byte() {
        // \xHH single byte. \x41 = 'A'.
        let toks = tokenize(r"E'\x41B\x42'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String("ABB".into()), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_lowercase_e_prefix() {
        // The prefix letter is accepted in lower case too.
        let toks = tokenize(r"e'hi\n'").expect("e-string lexes");
        assert_eq!(toks, vec![Token::String("hi\n".into()), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_doubled_quote() {
        // Even in E-string the doubled '' is a literal '.
        let toks = tokenize(r"E'it''s ok'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String("it's ok".into()), Token::Eof]);
    }
}
spg_sql/lexer.rs

spg_sql/
lexer.rs