spg_sql/
lexer.rs

1//! Lexer for the PG-dialect subset that SPG accepts.
2//!
3//! v0.2 token stream is value-only — no source spans yet. Errors do report
4//! the byte offset where the offending construct started. Identifiers are
5//! ASCII case-folded to lower-case (matches PG when un-quoted). Quoted
6//! identifiers (`"..."`) preserve case; `""` is an embedded quote.
7//! String literals (`'...'`) follow PG single-quote convention with `''`
//! as the embedded quote. Escape strings (`E'...'` / `e'...'`) are decoded
//! and emitted as an ordinary `Token::String`; dollar-quoted strings
//! (`$$ … $$` / `$tag$ … $tag$`) are captured verbatim as a single
//! `Token::String`.
10
11use alloc::string::{String, ToString};
12use alloc::vec::Vec;
13use core::fmt;
14
/// A single lexical token produced by `tokenize`.
///
/// Variants are grouped roughly by category, but keywords and operators
/// added in later versions were appended further down, so the groups are
/// not contiguous.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    As,
    Null,
    True,
    False,
    And,
    Or,
    Not,
    Create,
    Table,
    Insert,
    Into,
    Values,
    Index,
    On,
    Begin,
    Commit,
    Rollback,
    Order,
    By,
    Limit,

    // Identifiers
    Ident(String),       // ASCII case-folded
    QuotedIdent(String), // original case, "" → "
    /// v7.14.0 — MySQL session / user variable reference
    /// (`@VAR` / `@@VAR`). The wrapped string is the verbatim
    /// source form (including the `@` / `@@` prefix). Used by
    /// mysqldump preamble (`SET @OLD_FOREIGN_KEY_CHECKS =
    /// @@FOREIGN_KEY_CHECKS, …`); SPG accepts the token and
    /// the SET parser treats the assignment as a no-op apart
    /// from any second LHS that targets a real session
    /// parameter (e.g. `FOREIGN_KEY_CHECKS=0`).
    SessionVar(String),

    // Literals
    Integer(i64),
    Float(f64),
    String(String),

    // Operators
    Plus,
    Minus,
    Star,
    Slash,
    Eq,
    NotEq,
    Lt,
    LtEq,
    Gt,
    GtEq,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR strict contained-in
    /// `<<`. LHS is strictly inside RHS (no equality).
    InetContainedBy,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR contained-in-or-equal
    /// `<<=`. LHS network ⊆ RHS network.
    InetContainedByEq,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR strict contains `>>`.
    /// LHS strictly contains RHS.
    InetContains,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR contains-or-equal `>>=`.
    /// LHS network ⊇ RHS network.
    InetContainsEq,
    /// v7.17.0 Phase 3.P0-47 — PG INET / CIDR network overlap `&&`.
    /// Either side contains any address of the other.
    InetOverlap,

    // Punctuation
    LParen,
    RParen,
    LBracket,
    RBracket,
    Comma,
    Semicolon,
    Dot,
    /// v7.17.0 Phase 2.6 — standalone `@` punctuation. Emitted when
    /// `@` is NOT followed by an ident-start byte (i.e. the
    /// `@VAR` / `@@VAR` SessionVar path doesn't match). Lets the
    /// parser stitch the MySQL `'user'@'host'` DEFINER form back
    /// together as String + At + String. Pre-2.6 this same shape
    /// surfaced as a `LexErrorKind::UnknownChar('@')` and broke
    /// every mysqldump CREATE VIEW with a DEFINER clause at lex
    /// time.
    At,
    /// v4.14 `->` — JSON object/array element access, returns json.
    JsonGet,
    /// v4.14 `->>` — same access, returns text.
    JsonGetText,
    /// v6.4.5 `#>` — JSON path walk, returns json. Path is the
    /// right-hand TEXT with PG `{a,b,0}` syntax.
    JsonGetPath,
    /// v6.4.5 `#>>` — same walk, returns text.
    JsonGetPathText,
    /// v6.4.5 `@>` — JSON containment. `j @> sub` returns true if
    /// every key/value in `sub` is present in `j` with structural
    /// containment for objects + arrays.
    JsonContains,
    /// v7.12.2 `@@` — tsvector / tsquery match. Either ordering
    /// (`vec @@ q` or `q @@ vec`) parses; engine eval normalises
    /// before matching.
    TsMatch,
    /// pgvector L2 distance operator `<->`. Lexed as one token so the
    /// parser can give it its own precedence rung.
    L2Distance,
    /// pgvector inner-product operator `<#>` (returns negative dot product
    /// so smaller still means more similar — same semantics as pgvector).
    InnerProduct,
    /// pgvector cosine distance operator `<=>`.
    CosineDistance,
    /// PG-style cast `expr::type` — single token because we want it to bind
    /// at postfix precedence.
    DoubleColon,
    /// v7.12.4 — PL/pgSQL assignment operator `:=`.
    /// Outside PL/pgSQL bodies this token has no SQL-side meaning.
    ColonEq,
    /// v7.12.4 — bare `:` separator. Used inside `tsvector` external-form
    /// literals (`'cat:1 dog:2'::tsvector`) and as the fallback path for
    /// the PL/pgSQL assignment lexer.
    Colon,
    /// Standard SQL string concatenation `||`.
    Concat,
    /// `IS` keyword — postfix `IS NULL` / `IS NOT NULL` predicates.
    Is,
    // Further payload-free keywords, one variant per reserved word.
    Between,
    In,
    Like,
    Group,
    Distinct,
    Union,
    All,
    Join,
    Inner,
    Left,
    Cross,
    Outer,
    Default,
    Savepoint,
    Release,
    To,
    Having,
    Show,
    Extract,
    Offset,
    Asc,
    Desc,
    /// `INTERVAL` — followed by a string literal carrying the span text
    /// (e.g. `INTERVAL '1 day 2 hours'`).
    Interval,
    /// v6.1.1 — `$N` parameter placeholder for the extended query
    /// protocol. The number N is 1-based per PostgreSQL convention.
    /// `0` and `$0` are not valid; the lexer rejects them.
    Placeholder(u16),

    /// v6.1.2 — `DROP` keyword. Used by `DROP PUBLICATION <name>`.
    /// Reserved for future `DROP TABLE` / `DROP INDEX` / `DROP USER`
    /// surface that currently goes through SHOW-shaped admin SQL.
    Drop,
    /// v6.1.2 — `FOR` keyword (publication scope).
    For,
    /// v6.1.2 — `TABLES` plural keyword (`FOR ALL TABLES`,
    /// `FOR ALL TABLES EXCEPT …`). The existing `TABLE` keyword
    /// stays a separate token so `CREATE TABLE`'s single-table
    /// form keeps lexing as today.
    Tables,
    /// v6.1.3 (reserved at v6.1.2 to keep the AST shape stable) —
    /// `EXCEPT` keyword for `FOR ALL TABLES EXCEPT t1, t2`.
    Except,
    /// v6.1.2 — `PUBLICATION` keyword.
    Publication,
    /// v6.1.4 (reserved at v6.1.2) — `SUBSCRIPTION` keyword.
    Subscription,
    /// v6.1.4 — `CONNECTION` keyword (for
    /// `CREATE SUBSCRIPTION … CONNECTION '<conn_str>' …`).
    Connection,

    /// End-of-input marker; `tokenize` always appends it as the last token.
    Eof,
}
197
/// The category of a lexing failure; paired with a byte offset in
/// [`LexError`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A character that does not start any token. Carries the offending
    /// character as decoded from the input at the error position.
    UnknownChar(char),
    /// A string literal with no terminator. `tokenize` raises this for a
    /// dollar-quoted string whose closing tag is missing; presumably the
    /// quoted-string helpers (not visible here) raise it too.
    UnterminatedString,
    /// NOTE(review): presumably raised by the quoted-identifier helper for
    /// a `"..."` / backtick identifier with no closing quote — confirm
    /// against `lex_quoted`.
    UnterminatedQuotedIdent,
    /// A `/*` block comment with no closing `*/`.
    UnterminatedBlockComment,
    /// A malformed numeric construct; carries the offending source text.
    /// `tokenize` raises this for a `$N` placeholder that is `0` or
    /// exceeds `u16::MAX`; presumably the number helper raises it too.
    BadNumber(String),
}
206
/// A lexing failure: what went wrong plus where in the input it happened.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// What kind of failure occurred.
    pub kind: LexErrorKind,
    /// Byte offset into the input where the offending construct started.
    pub pos: usize,
}
212
213impl fmt::Display for LexError {
214    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
215        match &self.kind {
216            LexErrorKind::UnknownChar(c) => write!(f, "unknown char {c:?} at byte {}", self.pos),
217            LexErrorKind::UnterminatedString => {
218                write!(f, "unterminated string literal at byte {}", self.pos)
219            }
220            LexErrorKind::UnterminatedQuotedIdent => {
221                write!(f, "unterminated quoted identifier at byte {}", self.pos)
222            }
223            LexErrorKind::UnterminatedBlockComment => {
224                write!(f, "unterminated /* */ comment at byte {}", self.pos)
225            }
226            LexErrorKind::BadNumber(s) => {
227                write!(f, "invalid number literal {s:?} at byte {}", self.pos)
228            }
229        }
230    }
231}
232
/// Tokenize `input` into a `Vec<Token>` ending in `Token::Eof`.
///
/// The scanner walks `input` byte-by-byte. Whitespace, `--` line comments
/// and `/* … */` block comments produce no tokens. Block comments do not
/// nest: the first `*/` closes the comment.
///
/// Match-arm order is significant: every guarded multi-byte arm (`--`,
/// `/*`, `*/`, `E'`, `.<digit>`, `::`, `:=`, `$$`, `$tag$`, `$N`) must
/// stay ahead of the unguarded single-byte arm for the same leading byte.
///
/// # Errors
///
/// Returns a [`LexError`] whose `pos` is the byte offset at which the
/// offending construct started:
///
/// * `UnterminatedBlockComment` — `/*` with no closing `*/`.
/// * `UnterminatedString` — a dollar-quoted string whose closing tag is
///   missing.
/// * `BadNumber` — a `$N` placeholder that is `0` or larger than
///   `u16::MAX`.
/// * `UnknownChar` — `#` not followed by `>`, `$ident` with no closing
///   `$`, or any byte no arm recognises.
///
/// Errors from the `lex_quoted`, `lex_escape_string` and `lex_number`
/// helpers (defined outside this view) are propagated as well.
#[allow(clippy::too_many_lines)] // big match — splitting would obscure the dispatch table
pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
    let bytes = input.as_bytes();
    let mut i = 0usize;
    let mut out = Vec::new();

    while i < bytes.len() {
        let b = bytes[i];
        match b {
            b' ' | b'\t' | b'\n' | b'\r' => {
                i += 1;
            }
            // `--` line comment: skip up to (not including) the newline,
            // which the whitespace arm then consumes.
            b'-' if peek_eq(bytes, i + 1, b'-') => {
                i += 2;
                while i < bytes.len() && bytes[i] != b'\n' {
                    i += 1;
                }
            }
            b'/' if peek_eq(bytes, i + 1, b'*') => {
                let start = i;
                // v7.14.0 — MySQL versioned conditional comment
                // `/*!NNNNN <body> */`. The body is real SQL that
                // MySQL/MariaDB executes when the runtime version
                // matches the 5-digit code; PG strips the whole
                // thing as a block comment. SPG sides with MySQL
                // semantics for dump compatibility: skip the
                // `/*!NNNNN ` prefix and continue lexing the body
                // as ordinary tokens. The closing `*/` is later
                // matched + skipped by the symmetric arm below.
                if peek_eq(bytes, i + 2, b'!') {
                    let mut j = i + 3;
                    // Skip the optional version code (any run of ASCII
                    // digits — conventionally five) and at most one
                    // following space or tab.
                    while j < bytes.len() && bytes[j].is_ascii_digit() {
                        j += 1;
                    }
                    if j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') {
                        j += 1;
                    }
                    i = j;
                    continue;
                }
                // Ordinary block comment: scan for the first `*/`.
                i += 2;
                let mut closed = false;
                while i + 1 < bytes.len() {
                    if bytes[i] == b'*' && bytes[i + 1] == b'/' {
                        i += 2;
                        closed = true;
                        break;
                    }
                    i += 1;
                }
                if !closed {
                    return Err(LexError {
                        kind: LexErrorKind::UnterminatedBlockComment,
                        pos: start,
                    });
                }
            }
            // v7.14.0 — bare `*/` (closing of the v7.14 MySQL
            // versioned-comment opener that didn't consume the
            // closer). We treat it as an inline comment terminator
            // and skip 2 bytes. No open/close state is tracked, so
            // any `*/` reaching this arm is skipped, whether or not a
            // `/*!` opener preceded it.
            b'*' if peek_eq(bytes, i + 1, b'/') => {
                i += 2;
            }
            b'\'' => {
                let (tok, consumed) = lex_quoted(input, i, b'\'', false)?;
                out.push(tok);
                i += consumed;
            }
            // v7.18 — PG escape-string literal `E'...'` / `e'...'`.
            // Closes the mailrs D-pre #3 reverse-acceptance gap:
            // `INSERT INTO oq VALUES (E'\\xdeadbeef'::bytea)` needs
            // the `E` prefix so `\\` decodes to a single `\`. The
            // produced Token::String carries the decoded body so
            // downstream parser / cast paths treat it identically
            // to a regular string literal.
            // Must stay ahead of the identifier arm, which would
            // otherwise lex the `E` as an identifier.
            b'E' | b'e' if peek_eq(bytes, i + 1, b'\'') => {
                let (tok, consumed) = lex_escape_string(input, i + 1)?;
                out.push(tok);
                // `consumed` is measured from the opening quote; add one
                // for the `E` prefix.
                i += 1 + consumed;
            }
            b'"' => {
                let (tok, consumed) = lex_quoted(input, i, b'"', true)?;
                out.push(tok);
                i += consumed;
            }
            // MySQL-flavoured backtick-quoted identifier. Same semantics
            // as the standard `"..."` form, including embedded "``" as
            // a literal backtick.
            b'`' => {
                let (tok, consumed) = lex_quoted(input, i, b'`', true)?;
                out.push(tok);
                i += consumed;
            }
            // Unquoted identifier or keyword: ASCII letter / underscore,
            // then ASCII letters, digits and underscores.
            b if b.is_ascii_alphabetic() || b == b'_' => {
                let start = i;
                i += 1;
                while i < bytes.len() {
                    let c = bytes[i];
                    if c.is_ascii_alphanumeric() || c == b'_' {
                        i += 1;
                    } else {
                        break;
                    }
                }
                let raw = &input[start..i];
                // v3.0.5: try the keyword table case-insensitively
                // without allocating; only the ident fall-through
                // pays for a lowercase String.
                out.push(keyword_or_ident_raw(raw));
            }
            b if b.is_ascii_digit() => {
                let (tok, consumed) =
                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
                out.push(tok);
                i += consumed;
            }
            // Leading-dot number such as `.5`; a `.` not followed by a
            // digit falls through to the `Token::Dot` arm further down.
            b'.' if peek_pred(bytes, i + 1, u8::is_ascii_digit) => {
                let (tok, consumed) =
                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
                out.push(tok);
                i += consumed;
            }
            b'+' => single(&mut out, Token::Plus, &mut i),
            b'-' => {
                // v4.14: `->>` and `->` for JSON path access. `->>`
                // must be tried before `->` (longest match).
                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
                    out.push(Token::JsonGetText);
                    i += 3;
                } else if peek_eq(bytes, i + 1, b'>') {
                    out.push(Token::JsonGet);
                    i += 2;
                } else {
                    single(&mut out, Token::Minus, &mut i);
                }
            }
            // v6.4.5: `#>>` and `#>` JSON path walk. A `#` in any other
            // position is rejected.
            b'#' => {
                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
                    out.push(Token::JsonGetPathText);
                    i += 3;
                } else if peek_eq(bytes, i + 1, b'>') {
                    out.push(Token::JsonGetPath);
                    i += 2;
                } else {
                    return Err(LexError {
                        kind: LexErrorKind::UnknownChar('#'),
                        pos: i,
                    });
                }
            }
            // v6.4.5: `@>` JSON containment.
            // v7.12.2: `@@` tsvector / tsquery match.
            // v7.14.0: `@@NAME` MySQL session variable ref +
            //          `@NAME` user variable ref. mysqldump preamble
            //          uses both heavily (`SET @OLD_FOREIGN_KEY_CHECKS
            //          = @@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0`).
            //          We lex both as a single SessionVar token so
            //          the parser can accept and ignore them.
            b'@' => {
                if peek_eq(bytes, i + 1, b'>') {
                    out.push(Token::JsonContains);
                    i += 2;
                } else if peek_eq(bytes, i + 1, b'@')
                    && !is_session_var_ident_start(bytes.get(i + 2).copied())
                {
                    // `@@` not followed by an ident-start byte is
                    // the tsquery `@@` operator.
                    out.push(Token::TsMatch);
                    i += 2;
                } else {
                    // `@VAR` / `@@VAR` — MySQL user / session
                    // variable reference. Consume the ident-shaped
                    // tail and emit as Token::SessionVar so the
                    // SET parser can accept-and-ignore.
                    let prefix_end = if peek_eq(bytes, i + 1, b'@') {
                        i + 2
                    } else {
                        i + 1
                    };
                    // The tail is scanned with the *continue* predicate
                    // from its first byte, so after a lone `@` a tail
                    // starting with a digit, `.` or `$` is accepted too.
                    let mut end = prefix_end;
                    while end < bytes.len() && is_session_var_ident_continue(bytes[end]) {
                        end += 1;
                    }
                    if end == prefix_end {
                        // v7.17.0 Phase 2.6 — `@` not followed by an
                        // ident-shaped tail. mysqldump's DEFINER
                        // form `'user'@'host'` lands here (next
                        // byte is `'`). Emit as Token::At so the
                        // parser can stitch the surrounding String
                        // tokens. Single `@@` already short-circuits
                        // to Token::TsMatch above, so this only
                        // fires for a true lone `@`.
                        out.push(Token::At);
                        i = prefix_end;
                        continue;
                    }
                    // Verbatim source text, prefix included.
                    out.push(Token::SessionVar(input[i..end].to_string()));
                    i = end;
                }
            }
            b'*' => single(&mut out, Token::Star, &mut i),
            b'/' => single(&mut out, Token::Slash, &mut i),
            b'(' => single(&mut out, Token::LParen, &mut i),
            b')' => single(&mut out, Token::RParen, &mut i),
            b'[' => single(&mut out, Token::LBracket, &mut i),
            b']' => single(&mut out, Token::RBracket, &mut i),
            b',' => single(&mut out, Token::Comma, &mut i),
            b';' => single(&mut out, Token::Semicolon, &mut i),
            b'.' => single(&mut out, Token::Dot, &mut i),
            b'=' => single(&mut out, Token::Eq, &mut i),
            // `<`-family operators, longest match first: the three-byte
            // forms are tested before any two-byte form sharing a prefix.
            b'<' => {
                if peek_eq(bytes, i + 1, b'=') && peek_eq(bytes, i + 2, b'>') {
                    out.push(Token::CosineDistance);
                    i += 3;
                } else if peek_eq(bytes, i + 1, b'#') && peek_eq(bytes, i + 2, b'>') {
                    out.push(Token::InnerProduct);
                    i += 3;
                } else if peek_eq(bytes, i + 1, b'-') && peek_eq(bytes, i + 2, b'>') {
                    out.push(Token::L2Distance);
                    i += 3;
                } else if peek_eq(bytes, i + 1, b'<') && peek_eq(bytes, i + 2, b'=') {
                    // v7.17.0 Phase 3.P0-47 — PG INET `<<=` contained-or-equal.
                    out.push(Token::InetContainedByEq);
                    i += 3;
                } else if peek_eq(bytes, i + 1, b'<') {
                    // v7.17.0 Phase 3.P0-47 — PG INET `<<` strict contained.
                    out.push(Token::InetContainedBy);
                    i += 2;
                } else if peek_eq(bytes, i + 1, b'=') {
                    out.push(Token::LtEq);
                    i += 2;
                } else if peek_eq(bytes, i + 1, b'>') {
                    // `<>` is the SQL spelling of not-equal.
                    out.push(Token::NotEq);
                    i += 2;
                } else {
                    out.push(Token::Lt);
                    i += 1;
                }
            }
            b':' if peek_eq(bytes, i + 1, b':') => {
                out.push(Token::DoubleColon);
                i += 2;
            }
            b':' if peek_eq(bytes, i + 1, b'=') => {
                // v7.12.4 — PL/pgSQL assignment operator `:=`.
                out.push(Token::ColonEq);
                i += 2;
            }
            b':' => {
                // v7.12.4 — bare `:`. Used inside `tsvector` external-form
                // literals which the cast parser consumes in-token, and as a
                // separator the PL/pgSQL assignment lexer can recover from.
                out.push(Token::Colon);
                i += 1;
            }
            // Only the doubled `||` is recognised; a single `|` reaches
            // the catch-all arm and is reported as an unknown character.
            b'|' if peek_eq(bytes, i + 1, b'|') => {
                out.push(Token::Concat);
                i += 2;
            }
            b'>' => {
                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'=') {
                    // v7.17.0 Phase 3.P0-47 — PG INET `>>=` contains-or-equal.
                    out.push(Token::InetContainsEq);
                    i += 3;
                } else if peek_eq(bytes, i + 1, b'>') {
                    // v7.17.0 Phase 3.P0-47 — PG INET `>>` strict contains.
                    out.push(Token::InetContains);
                    i += 2;
                } else if peek_eq(bytes, i + 1, b'=') {
                    out.push(Token::GtEq);
                    i += 2;
                } else {
                    out.push(Token::Gt);
                    i += 1;
                }
            }
            b'&' if peek_eq(bytes, i + 1, b'&') => {
                // v7.17.0 Phase 3.P0-47 — PG INET network overlap `&&`.
                out.push(Token::InetOverlap);
                i += 2;
            }
            // `!=` is an alternate spelling of not-equal; a lone `!` is
            // rejected by the catch-all arm.
            b'!' if peek_eq(bytes, i + 1, b'=') => {
                out.push(Token::NotEq);
                i += 2;
            }
            // v7.9.27 — PG dollar-quoted string `$$ … $$` (or
            // `$tag$ … $tag$`). Used in `DO $$ … $$ LANGUAGE
            // plpgsql;` blocks that pg_dump emits for idempotent
            // migrations. SPG has no PL/pgSQL, so the lexer
            // consumes the entire string as a single Token::String
            // and the parser treats the surrounding `DO …;` as a
            // no-op. mailrs follow-up H1.
            b'$' if i + 1 < bytes.len() && bytes[i + 1] == b'$' => {
                // Empty tag form: `$$ … $$`.
                let end = find_dollar_tag_end(bytes, i + 2, b"$$");
                let body = match end {
                    Some(e) => &input[i + 2..e],
                    None => {
                        return Err(LexError {
                            kind: LexErrorKind::UnterminatedString,
                            pos: i,
                        });
                    }
                };
                out.push(Token::String(body.to_string()));
                // `end` is `Some` here: the `None` case returned above.
                i = end.unwrap() + 2;
            }
            b'$' if i + 1 < bytes.len()
                && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_') =>
            {
                // Tagged form: `$foo$ … $foo$`. Scan the tag
                // ident, find the closing copy.
                let mut j = i + 1;
                while j < bytes.len() && (bytes[j].is_ascii_alphanumeric() || bytes[j] == b'_') {
                    j += 1;
                }
                if j >= bytes.len() || bytes[j] != b'$' {
                    // Not a dollar-quoted string — report it directly,
                    // in the same shape as the catch-all arm at the
                    // bottom of this match.
                    let ch = input[i..].chars().next().unwrap_or('?');
                    return Err(LexError {
                        kind: LexErrorKind::UnknownChar(ch),
                        pos: i,
                    });
                }
                // `close` is the full delimiter, both `$` included.
                let close: alloc::vec::Vec<u8> = bytes[i..=j].to_vec();
                let end = find_dollar_tag_end(bytes, j + 1, &close);
                let body = match end {
                    Some(e) => &input[j + 1..e],
                    None => {
                        return Err(LexError {
                            kind: LexErrorKind::UnterminatedString,
                            pos: i,
                        });
                    }
                };
                out.push(Token::String(body.to_string()));
                // `end` is `Some` here: the `None` case returned above.
                i = end.unwrap() + close.len();
            }
            // v6.1.1: `$N` parameter placeholder for the extended
            // query protocol. PG numbers them 1..=N; this arm rejects
            // `$0` and any value above `u16::MAX`. A bare `$` not
            // followed by a digit is rejected by the catch-all arm.
            b'$' if i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() => {
                let mut j = i + 1;
                let mut n: u32 = 0;
                // Saturating arithmetic: an over-long digit run pins at
                // `u32::MAX` and is then caught by the range check.
                while j < bytes.len() && bytes[j].is_ascii_digit() {
                    n = n
                        .saturating_mul(10)
                        .saturating_add(u32::from(bytes[j] - b'0'));
                    j += 1;
                }
                if n == 0 || n > u32::from(u16::MAX) {
                    return Err(LexError {
                        kind: LexErrorKind::BadNumber(input[i..j].to_string()),
                        pos: i,
                    });
                }
                // Safe to narrow: the range check above bounds `n`.
                #[allow(clippy::cast_possible_truncation)]
                out.push(Token::Placeholder(n as u16));
                i = j;
            }
            // Catch-all: decode the full character at `i` (it may be
            // multi-byte UTF-8) so the error names it correctly.
            _ => {
                let ch = input[i..].chars().next().unwrap_or('?');
                return Err(LexError {
                    kind: LexErrorKind::UnknownChar(ch),
                    pos: i,
                });
            }
        }
    }
    out.push(Token::Eof);
    Ok(out)
}
611
/// Returns `true` when `bytes` has a byte at index `i` and that byte equals
/// `target`. An out-of-range `i` yields `false` rather than panicking.
fn peek_eq(bytes: &[u8], i: usize, target: u8) -> bool {
    match bytes.get(i) {
        Some(&found) => found == target,
        None => false,
    }
}
615
/// v7.14.0 — does `b` qualify as the first byte of an identifier-shaped
/// name? True only for an ASCII letter or `_`; `None` (end of input)
/// and every other byte are rejected.
fn is_session_var_ident_start(b: Option<u8>) -> bool {
    match b {
        Some(b'_') => true,
        Some(c) => c.is_ascii_alphabetic(),
        None => false,
    }
}
623
/// Is `b` allowed inside the tail of a `@VAR` / `@@VAR` name? Accepts
/// ASCII letters and digits plus `_`, `.` (scope qualifiers such as
/// `@@global.sql_mode`) and `$`.
fn is_session_var_ident_continue(b: u8) -> bool {
    matches!(b, b'_' | b'.' | b'$') || b.is_ascii_alphanumeric()
}
631
/// v7.9.27 — locate the next occurrence of `tag` (e.g. `b"$$"` or
/// `b"$foo$"`) in `bytes`, searching from index `from` onward.
///
/// Returns the absolute start index of the match, or `None` when `tag`
/// is empty, `from` lies past the end of `bytes`, or no match exists.
fn find_dollar_tag_end(bytes: &[u8], from: usize, tag: &[u8]) -> Option<usize> {
    // `windows(0)` would panic, so the empty tag is rejected up front.
    if tag.is_empty() {
        return None;
    }
    let haystack = bytes.get(from..)?;
    haystack
        .windows(tag.len())
        .position(|window| window == tag)
        .map(|offset| from + offset)
}
647
/// Returns `true` when `bytes` has a byte at index `i` and `pred` accepts
/// it. An out-of-range `i` yields `false` without calling `pred`.
fn peek_pred<F: Fn(&u8) -> bool>(bytes: &[u8], i: usize, pred: F) -> bool {
    match bytes.get(i) {
        Some(byte) => pred(byte),
        None => false,
    }
}
651
652fn single(out: &mut Vec<Token>, tok: Token, i: &mut usize) {
653    out.push(tok);
654    *i += 1;
655}
656
657/// Length-first ASCII-CI keyword lookup. Avoids allocating a
658/// lowercase `String` when the input matches a keyword; only the ident
659/// fall-through path pays for the lowercase copy.
660///
661/// Grouped by length so the outer `match` becomes a small jump table.
662/// Within a length bucket every keyword has either a unique first
663/// byte (cheap dispatch) or a small set of disambiguating
664/// trailing-byte comparisons. All comparisons are ASCII-CI (XOR
665/// 0x20 on each byte before the compare).
666fn keyword_or_ident_raw(raw: &str) -> Token {
667    let b = raw.as_bytes();
668    let tok = match b.len() {
669        2 => kw_len2(b),
670        3 => kw_len3(b),
671        4 => kw_len4(b),
672        5 => kw_len5(b),
673        6 => kw_len6(b),
674        7 => kw_len7(b),
675        8 => kw_len8(b),
676        9 => kw_len9(b),
677        10 => kw_len10(b),
678        11 => kw_len11(b),
679        12 => kw_len12(b),
680        _ => None,
681    };
682    match tok {
683        Some(t) => t,
684        // Ident fall-through: this is the only path that allocates.
685        None => Token::Ident(raw.to_ascii_lowercase()),
686    }
687}
688
/// ASCII case-insensitive comparison of `input` against `lower`.
///
/// `lower` is expected to be lower-case already: only `input` is
/// case-folded, so an upper-case byte in `lower` can never match.
/// Slices of different lengths are never equal.
#[inline]
fn eq_ci(input: &[u8], lower: &[u8]) -> bool {
    input.len() == lower.len()
        && input
            .iter()
            .zip(lower)
            .all(|(have, want)| have.to_ascii_lowercase() == *want)
}
706
707#[inline]
708fn kw_len2(b: &[u8]) -> Option<Token> {
709    // 7 keywords: as, by, in, is, on, or, to
710    if eq_ci(b, b"as") {
711        return Some(Token::As);
712    }
713    if eq_ci(b, b"by") {
714        return Some(Token::By);
715    }
716    if eq_ci(b, b"in") {
717        return Some(Token::In);
718    }
719    if eq_ci(b, b"is") {
720        return Some(Token::Is);
721    }
722    if eq_ci(b, b"on") {
723        return Some(Token::On);
724    }
725    if eq_ci(b, b"or") {
726        return Some(Token::Or);
727    }
728    if eq_ci(b, b"to") {
729        return Some(Token::To);
730    }
731    None
732}
733
734#[inline]
735fn kw_len3(b: &[u8]) -> Option<Token> {
736    // 5 keywords: all, and, asc, not, for
737    if eq_ci(b, b"for") {
738        return Some(Token::For);
739    }
740    if eq_ci(b, b"all") {
741        return Some(Token::All);
742    }
743    if eq_ci(b, b"and") {
744        return Some(Token::And);
745    }
746    if eq_ci(b, b"asc") {
747        return Some(Token::Asc);
748    }
749    if eq_ci(b, b"not") {
750        return Some(Token::Not);
751    }
752    None
753}
754
755#[inline]
756fn kw_len4(b: &[u8]) -> Option<Token> {
757    // 10 keywords: from, null, true, into, like, join, left, show, desc, drop
758    if eq_ci(b, b"from") {
759        return Some(Token::From);
760    }
761    if eq_ci(b, b"drop") {
762        return Some(Token::Drop);
763    }
764    if eq_ci(b, b"null") {
765        return Some(Token::Null);
766    }
767    if eq_ci(b, b"true") {
768        return Some(Token::True);
769    }
770    if eq_ci(b, b"into") {
771        return Some(Token::Into);
772    }
773    if eq_ci(b, b"like") {
774        return Some(Token::Like);
775    }
776    if eq_ci(b, b"join") {
777        return Some(Token::Join);
778    }
779    if eq_ci(b, b"left") {
780        return Some(Token::Left);
781    }
782    if eq_ci(b, b"show") {
783        return Some(Token::Show);
784    }
785    if eq_ci(b, b"desc") {
786        return Some(Token::Desc);
787    }
788    None
789}
790
791#[inline]
792fn kw_len5(b: &[u8]) -> Option<Token> {
793    // 12 keywords: false, where, table, index, begin, order, limit,
794    // group, union, inner, cross, outer
795    if eq_ci(b, b"false") {
796        return Some(Token::False);
797    }
798    if eq_ci(b, b"where") {
799        return Some(Token::Where);
800    }
801    if eq_ci(b, b"table") {
802        return Some(Token::Table);
803    }
804    if eq_ci(b, b"index") {
805        return Some(Token::Index);
806    }
807    if eq_ci(b, b"begin") {
808        return Some(Token::Begin);
809    }
810    if eq_ci(b, b"order") {
811        return Some(Token::Order);
812    }
813    if eq_ci(b, b"limit") {
814        return Some(Token::Limit);
815    }
816    if eq_ci(b, b"group") {
817        return Some(Token::Group);
818    }
819    if eq_ci(b, b"union") {
820        return Some(Token::Union);
821    }
822    if eq_ci(b, b"inner") {
823        return Some(Token::Inner);
824    }
825    if eq_ci(b, b"cross") {
826        return Some(Token::Cross);
827    }
828    if eq_ci(b, b"outer") {
829        return Some(Token::Outer);
830    }
831    None
832}
833
834#[inline]
835fn kw_len6(b: &[u8]) -> Option<Token> {
836    // 9 keywords: select, create, insert, values, commit, having, offset, tables, except
837    if eq_ci(b, b"select") {
838        return Some(Token::Select);
839    }
840    if eq_ci(b, b"tables") {
841        return Some(Token::Tables);
842    }
843    if eq_ci(b, b"except") {
844        return Some(Token::Except);
845    }
846    if eq_ci(b, b"create") {
847        return Some(Token::Create);
848    }
849    if eq_ci(b, b"insert") {
850        return Some(Token::Insert);
851    }
852    if eq_ci(b, b"values") {
853        return Some(Token::Values);
854    }
855    if eq_ci(b, b"commit") {
856        return Some(Token::Commit);
857    }
858    if eq_ci(b, b"having") {
859        return Some(Token::Having);
860    }
861    if eq_ci(b, b"offset") {
862        return Some(Token::Offset);
863    }
864    None
865}
866
867#[inline]
868fn kw_len7(b: &[u8]) -> Option<Token> {
869    // 4 keywords: between, default, release, extract
870    if eq_ci(b, b"between") {
871        return Some(Token::Between);
872    }
873    if eq_ci(b, b"default") {
874        return Some(Token::Default);
875    }
876    if eq_ci(b, b"release") {
877        return Some(Token::Release);
878    }
879    if eq_ci(b, b"extract") {
880        return Some(Token::Extract);
881    }
882    None
883}
884
885#[inline]
886fn kw_len8(b: &[u8]) -> Option<Token> {
887    // 3 keywords: rollback, distinct, interval
888    if eq_ci(b, b"rollback") {
889        return Some(Token::Rollback);
890    }
891    if eq_ci(b, b"distinct") {
892        return Some(Token::Distinct);
893    }
894    if eq_ci(b, b"interval") {
895        return Some(Token::Interval);
896    }
897    None
898}
899
900#[inline]
901fn kw_len9(b: &[u8]) -> Option<Token> {
902    // 1 keyword: savepoint
903    if eq_ci(b, b"savepoint") {
904        return Some(Token::Savepoint);
905    }
906    None
907}
908
909#[inline]
910fn kw_len10(b: &[u8]) -> Option<Token> {
911    // 1 keyword: connection
912    if eq_ci(b, b"connection") {
913        return Some(Token::Connection);
914    }
915    None
916}
917
918#[inline]
919fn kw_len11(b: &[u8]) -> Option<Token> {
920    // 1 keyword: publication
921    if eq_ci(b, b"publication") {
922        return Some(Token::Publication);
923    }
924    None
925}
926
927#[inline]
928fn kw_len12(b: &[u8]) -> Option<Token> {
929    // 1 keyword: subscription
930    if eq_ci(b, b"subscription") {
931        return Some(Token::Subscription);
932    }
933    None
934}
935
936/// Lex a `'...'` string literal or `"..."` quoted identifier. The opening
937/// quote sits at `input[start]`; `quote` is its byte value. `is_ident` selects
938/// the resulting token shape.
939///
940/// PG-style doubling escapes the quote: `''` inside `'...'` is a literal `'`,
941/// same for `""` inside `"..."`.
942fn lex_quoted(
943    input: &str,
944    start: usize,
945    quote: u8,
946    is_ident: bool,
947) -> Result<(Token, usize), LexError> {
948    let bytes = input.as_bytes();
949    let mut i = start + 1;
950    let mut s = String::new();
951    loop {
952        if i >= bytes.len() {
953            return Err(LexError {
954                kind: if is_ident {
955                    LexErrorKind::UnterminatedQuotedIdent
956                } else {
957                    LexErrorKind::UnterminatedString
958                },
959                pos: start,
960            });
961        }
962        if bytes[i] == quote {
963            if peek_eq(bytes, i + 1, quote) {
964                s.push(quote as char);
965                i += 2;
966            } else {
967                i += 1;
968                break;
969            }
970        } else {
971            let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
972            s.push(ch);
973            i += ch.len_utf8();
974        }
975    }
976    let tok = if is_ident {
977        Token::QuotedIdent(s)
978    } else {
979        Token::String(s)
980    };
981    Ok((tok, i - start))
982}
983
984/// v7.18 — Lex a PG escape-string literal `E'...'`. `start` points
985/// at the opening single quote (the `E` was matched by the caller
986/// and is NOT part of `start`'s offset semantics — the consumed
987/// count returned excludes the `E`, which the caller adds).
988///
989/// Recognised escape sequences:
990///   \\ \' \" — literal backslash / quote
991///   \n \r \t \b \f — standard whitespace controls
992///   \0 — NUL
993///   \xHH — single hex byte (1–2 hex digits)
994///   \NNN — octal byte (1–3 octal digits)
995/// Any other `\X` decodes to the literal byte `X` (PG warns; SPG
996/// follows the lenient behaviour pg_dump output relies on).
997///
998/// Doubled `''` is still a literal `'` (same as the non-E form).
999fn lex_escape_string(input: &str, start: usize) -> Result<(Token, usize), LexError> {
1000    let bytes = input.as_bytes();
1001    debug_assert_eq!(bytes[start], b'\'');
1002    let mut i = start + 1;
1003    let mut s = String::new();
1004    loop {
1005        if i >= bytes.len() {
1006            return Err(LexError {
1007                kind: LexErrorKind::UnterminatedString,
1008                pos: start,
1009            });
1010        }
1011        let b = bytes[i];
1012        if b == b'\'' {
1013            if peek_eq(bytes, i + 1, b'\'') {
1014                s.push('\'');
1015                i += 2;
1016                continue;
1017            }
1018            i += 1;
1019            break;
1020        }
1021        if b == b'\\' && i + 1 < bytes.len() {
1022            let n = bytes[i + 1];
1023            match n {
1024                b'\\' => {
1025                    s.push('\\');
1026                    i += 2;
1027                }
1028                b'\'' => {
1029                    s.push('\'');
1030                    i += 2;
1031                }
1032                b'"' => {
1033                    s.push('"');
1034                    i += 2;
1035                }
1036                b'n' => {
1037                    s.push('\n');
1038                    i += 2;
1039                }
1040                b'r' => {
1041                    s.push('\r');
1042                    i += 2;
1043                }
1044                b't' => {
1045                    s.push('\t');
1046                    i += 2;
1047                }
1048                b'b' => {
1049                    s.push('\u{0008}');
1050                    i += 2;
1051                }
1052                b'f' => {
1053                    s.push('\u{000C}');
1054                    i += 2;
1055                }
1056                b'0' if i + 2 >= bytes.len() || !bytes[i + 2].is_ascii_digit() => {
1057                    s.push('\0');
1058                    i += 2;
1059                }
1060                b'x' => {
1061                    // \xH or \xHH — single byte by hex.
1062                    let h1 = bytes.get(i + 2).copied();
1063                    let h2 = bytes.get(i + 3).copied();
1064                    let n1 = h1.and_then(hex_digit_value);
1065                    let n2 = h2.and_then(hex_digit_value);
1066                    match (n1, n2) {
1067                        (Some(a), Some(b2)) => {
1068                            s.push((((a << 4) | b2) as u8) as char);
1069                            i += 4;
1070                        }
1071                        (Some(a), _) => {
1072                            s.push((a as u8) as char);
1073                            i += 3;
1074                        }
1075                        _ => {
1076                            // \x with no hex follows — literal x.
1077                            s.push('x');
1078                            i += 2;
1079                        }
1080                    }
1081                }
1082                d if d.is_ascii_digit() && d < b'8' => {
1083                    // \NNN octal — up to 3 octal digits.
1084                    let mut value: u32 = u32::from(d - b'0');
1085                    let mut take = 2;
1086                    while take < 4 {
1087                        let next = bytes.get(i + take).copied();
1088                        match next {
1089                            Some(c) if c.is_ascii_digit() && c < b'8' => {
1090                                value = (value << 3) | u32::from(c - b'0');
1091                                take += 1;
1092                            }
1093                            _ => break,
1094                        }
1095                    }
1096                    if let Some(c) = char::from_u32(value) {
1097                        s.push(c);
1098                    } else {
1099                        // Invalid Unicode — preserve as raw byte char.
1100                        s.push((value & 0xFF) as u8 as char);
1101                    }
1102                    i += take;
1103                }
1104                other => {
1105                    // Lenient fallback — same as PG with
1106                    // `standard_conforming_strings = off` warning:
1107                    // decode `\X` to literal `X`.
1108                    s.push(other as char);
1109                    i += 2;
1110                }
1111            }
1112        } else {
1113            let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
1114            s.push(ch);
1115            i += ch.len_utf8();
1116        }
1117    }
1118    Ok((Token::String(s), i - start))
1119}
1120
/// Numeric value (0–15) of an ASCII hexadecimal digit, accepting both
/// `a`–`f` and `A`–`F`. Any other byte yields `None`.
fn hex_digit_value(b: u8) -> Option<u32> {
    // Bytes >= 0x80 widen to U+0080..=U+00FF, none of which is a digit in
    // radix 16, so they fall out as `None` like every other non-hex byte.
    char::from(b).to_digit(16)
}
1129
1130fn lex_number(s: &str) -> Result<(Token, usize), LexErrorKind> {
1131    let bytes = s.as_bytes();
1132    let mut i = 0usize;
1133    let mut is_float = false;
1134
1135    while i < bytes.len() && bytes[i].is_ascii_digit() {
1136        i += 1;
1137    }
1138    if i < bytes.len() && bytes[i] == b'.' {
1139        is_float = true;
1140        i += 1;
1141        while i < bytes.len() && bytes[i].is_ascii_digit() {
1142            i += 1;
1143        }
1144    }
1145    if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
1146        is_float = true;
1147        i += 1;
1148        if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
1149            i += 1;
1150        }
1151        let exp_start = i;
1152        while i < bytes.len() && bytes[i].is_ascii_digit() {
1153            i += 1;
1154        }
1155        if exp_start == i {
1156            return Err(LexErrorKind::BadNumber(s[..i].to_string()));
1157        }
1158    }
1159
1160    let lit = &s[..i];
1161    if is_float {
1162        lit.parse::<f64>()
1163            .map(|v| (Token::Float(v), i))
1164            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1165    } else {
1166        lit.parse::<i64>()
1167            .map(|v| (Token::Integer(v), i))
1168            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1169    }
1170}
1171
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Tokenize `s`, panicking if the lexer reports an error.
    fn lex(s: &str) -> Vec<Token> {
        tokenize(s).expect("lex ok")
    }

    /// Shorthand for an unquoted (case-folded) identifier token.
    fn ident(name: &str) -> Token {
        Token::Ident(name.into())
    }

    /// Shorthand for a quoted identifier token.
    fn quoted(name: &str) -> Token {
        Token::QuotedIdent(name.into())
    }

    /// Shorthand for a string-literal token.
    fn string(text: &str) -> Token {
        Token::String(text.into())
    }

    #[test]
    fn empty_yields_only_eof() {
        let expected = vec![Token::Eof];
        assert_eq!(lex(""), expected);
    }

    #[test]
    fn whitespace_only_yields_only_eof() {
        let expected = vec![Token::Eof];
        assert_eq!(lex("   \t\n  "), expected);
    }

    #[test]
    fn keywords_are_case_insensitive() {
        let expected = vec![Token::Select, Token::Select, Token::Select, Token::Eof];
        assert_eq!(lex("SELECT select Select"), expected);
    }

    #[test]
    fn identifiers_lowercase_ascii() {
        let expected = vec![
            ident("hello"),
            ident("world"),
            ident("_x"),
            ident("x1"),
            Token::Eof,
        ];
        assert_eq!(lex("hello WORLD _x x1"), expected);
    }

    #[test]
    fn quoted_identifier_keeps_case_and_handles_embedded_quote() {
        let expected = vec![quoted("User Name"), quoted("a\"b"), Token::Eof];
        assert_eq!(lex(r#""User Name" "a""b""#), expected);
    }

    #[test]
    fn integer_and_float_literals() {
        let expected = vec![
            Token::Integer(0),
            Token::Integer(42),
            Token::Float(1.5),
            Token::Float(0.5),
            Token::Float(1e10),
            Token::Float(2.5e-3),
            Token::Eof,
        ];
        assert_eq!(lex("0 42 1.5 .5 1e10 2.5e-3"), expected);
    }

    #[test]
    fn negative_number_is_minus_then_integer() {
        // Same split as PG: the lexer emits the unary minus on its own
        // and leaves folding it into the literal to the parser.
        let expected = vec![Token::Minus, Token::Integer(42), Token::Eof];
        assert_eq!(lex("-42"), expected);
    }

    #[test]
    fn string_literal_doubled_quote_escape() {
        let expected = vec![string("hello"), string("it's"), Token::Eof];
        assert_eq!(lex("'hello' 'it''s'"), expected);
    }

    #[test]
    fn all_comparison_and_arithmetic_operators() {
        let expected = vec![
            Token::Eq,
            Token::NotEq,
            Token::NotEq,
            Token::Lt,
            Token::LtEq,
            Token::Gt,
            Token::GtEq,
            Token::Plus,
            Token::Minus,
            Token::Star,
            Token::Slash,
            Token::Eof,
        ];
        assert_eq!(lex("= <> != < <= > >= + - * /"), expected);
    }

    #[test]
    fn punctuation() {
        let expected = vec![
            Token::LParen,
            Token::RParen,
            Token::Comma,
            Token::Semicolon,
            Token::Dot,
            Token::Eof,
        ];
        assert_eq!(lex("( ) , ; ."), expected);
    }

    #[test]
    fn line_comment_skipped() {
        let expected = vec![Token::Select, Token::From, Token::Eof];
        assert_eq!(lex("SELECT -- trailing junk\nFROM"), expected);
    }

    #[test]
    fn block_comment_skipped() {
        let expected = vec![Token::Select, Token::Integer(1), Token::Eof];
        assert_eq!(lex("SELECT /* skipped */ 1"), expected);
    }

    #[test]
    fn unterminated_string_errors() {
        let failure = tokenize("'oops").unwrap_err();
        assert!(matches!(failure.kind, LexErrorKind::UnterminatedString));
        // The reported position is where the literal opened.
        assert_eq!(failure.pos, 0);
    }

    #[test]
    fn unterminated_block_comment_errors() {
        let failure = tokenize("/* never closed").unwrap_err();
        assert!(matches!(
            failure.kind,
            LexErrorKind::UnterminatedBlockComment
        ));
    }

    #[test]
    fn unknown_char_errors() {
        // v7.17.0 Phase 2.6 — a standalone `@` now lexes as Token::At
        // (mysqldump `'user'@'host'` DEFINER stitching), so it can no
        // longer stand in for "a character the lexer does not know".
        // This regression feeds the BEL control byte (0x07) instead.
        let failure = tokenize("\x07").unwrap_err();
        assert!(matches!(failure.kind, LexErrorKind::UnknownChar(_)));
    }

    #[test]
    fn at_alone_lexes_as_punctuation() {
        // v7.17.0 Phase 2.6 — the `'user'@'host'` MySQL DEFINER
        // form needs `@` to lex as a standalone token.
        let expected = vec![string("u"), Token::At, string("h"), Token::Eof];
        assert_eq!(lex("'u'@'h'"), expected);
    }

    #[test]
    fn dot_in_qualified_column() {
        let expected = vec![ident("t"), Token::Dot, ident("col"), Token::Eof];
        assert_eq!(lex("t.col"), expected);
    }

    // --- v0.11 brackets + distance op + vector keyword --------------------

    #[test]
    fn brackets_are_distinct_tokens() {
        let expected = vec![Token::LBracket, Token::RBracket, Token::Eof];
        assert_eq!(lex("[ ]"), expected);
    }

    #[test]
    fn l2_distance_is_three_char_token() {
        let with_operator = vec![ident("a"), Token::L2Distance, ident("b"), Token::Eof];
        assert_eq!(lex("a <-> b"), with_operator);

        // Bare `<-` should NOT match L2Distance.
        let without_operator = vec![
            ident("a"),
            Token::Lt,
            Token::Minus,
            ident("b"),
            Token::Eof,
        ];
        assert_eq!(lex("a <- b"), without_operator);
    }

    #[test]
    fn order_by_limit_are_keywords() {
        let expected = vec![Token::Order, Token::By, Token::Limit, Token::Eof];
        assert_eq!(lex("ORDER BY LIMIT"), expected);
    }

    // --- v1.2: pgvector distance ops + PG cast --------------------------

    #[test]
    fn inner_product_operator_3char() {
        let expected = vec![ident("a"), Token::InnerProduct, ident("b"), Token::Eof];
        assert_eq!(lex("a <#> b"), expected);
    }

    #[test]
    fn cosine_distance_operator_3char() {
        let cosine = vec![ident("a"), Token::CosineDistance, ident("b"), Token::Eof];
        assert_eq!(lex("a <=> b"), cosine);

        // The shorter `<=` must still lex on its own now that `<=>`
        // exists (greedy match takes the longest).
        let less_or_equal = vec![ident("a"), Token::LtEq, ident("b"), Token::Eof];
        assert_eq!(lex("a <= b"), less_or_equal);
    }

    #[test]
    fn double_colon_cast_token() {
        let expected = vec![ident("x"), Token::DoubleColon, ident("int"), Token::Eof];
        assert_eq!(lex("x::INT"), expected);
    }

    #[test]
    fn lone_single_colon_lexes_as_colon_token() {
        // v7.12.4 — single `:` is now a token (PL/pgSQL surface
        // + tsvector external-form literal both need it). The
        // pre-v7.12.4 "single colon = unknown char" behaviour
        // was incidental.
        let tokens = tokenize(":x").expect("colon now lexes");
        assert_eq!(tokens[0], Token::Colon);
    }

    #[test]
    fn colon_eq_lexes_as_assignment() {
        // v7.12.4 — PL/pgSQL assignment operator.
        let tokens = tokenize("x := 1").expect("colon-eq lexes");
        // Stream is: identifier `x`, ColonEq, then the number literal.
        assert!(matches!(tokens[1], Token::ColonEq));
    }

    #[test]
    fn pg_escape_string_double_backslash_decodes_to_single() {
        // v7.18 — E'\\xdeadbeef' decodes to literal `\xdeadbeef`
        // (10 chars: backslash + xdeadbeef). The downstream
        // `::bytea` cast then reads that as the PG hex-form bytea
        // literal. mailrs D-pre #3.
        let tokens = tokenize(r"E'\\xdeadbeef'").expect("E-string lexes");
        let expected = vec![string(r"\xdeadbeef"), Token::Eof];
        assert_eq!(tokens, expected);
    }

    #[test]
    fn pg_escape_string_supports_basic_escapes() {
        // \n / \t / \' / \\ — the PG standard set.
        let tokens = tokenize(r"E'a\nb\tc\'d\\e'").expect("E-string lexes");
        let expected = vec![string("a\nb\tc'd\\e"), Token::Eof];
        assert_eq!(tokens, expected);
    }

    #[test]
    fn pg_escape_string_hex_byte() {
        // \xHH single byte. \x41 = 'A'.
        let tokens = tokenize(r"E'\x41B\x42'").expect("E-string lexes");
        let expected = vec![string("ABB"), Token::Eof];
        assert_eq!(tokens, expected);
    }

    #[test]
    fn pg_escape_string_lowercase_e_prefix() {
        let tokens = tokenize(r"e'hi\n'").expect("e-string lexes");
        let expected = vec![string("hi\n"), Token::Eof];
        assert_eq!(tokens, expected);
    }

    #[test]
    fn pg_escape_string_doubled_quote() {
        // Even in E-string the doubled '' is a literal '.
        let tokens = tokenize(r"E'it''s ok'").expect("E-string lexes");
        let expected = vec![string("it's ok"), Token::Eof];
        assert_eq!(tokens, expected);
    }
}
spg_sql/lexer.rs

spg_sql/
lexer.rs