spg_sql/
lexer.rs

1//! Lexer for the PG-dialect subset that SPG accepts.
2//!
3//! v0.2 token stream is value-only — no source spans yet. Errors do report
4//! the byte offset where the offending construct started. Identifiers are
5//! ASCII case-folded to lower-case (matches PG when un-quoted). Quoted
6//! identifiers (`"..."`) preserve case; `""` is an embedded quote.
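//! String literals (`'...'`) follow PG single-quote convention with `''`
//! as the embedded quote. E-strings get no special treatment yet (the `E`
//! prefix lexes as an ordinary identifier followed by a plain string).
//! Dollar-quoted strings (`$$...$$` / `$tag$...$tag$`) are lexed as a single
//! string token whose body is taken verbatim.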
10
11use alloc::string::{String, ToString};
12use alloc::vec::Vec;
13use core::fmt;
14
/// One lexical token produced by [`tokenize`].
///
/// Keyword variants are matched ASCII case-insensitively. Anything
/// identifier-shaped that is not a keyword becomes [`Token::Ident`].
/// The stream always ends with [`Token::Eof`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    As,
    Null,
    True,
    False,
    And,
    Or,
    Not,
    Create,
    Table,
    Insert,
    Into,
    Values,
    Index,
    On,
    Begin,
    Commit,
    Rollback,
    Order,
    By,
    Limit,

    // Identifiers
    Ident(String),       // ASCII case-folded
    QuotedIdent(String), // original case, "" → " (also produced for `...` backtick names)
    /// v7.14.0 — MySQL session / user variable reference
    /// (`@VAR` / `@@VAR`). The wrapped string is the verbatim
    /// source form (including the `@` / `@@` prefix). Used by
    /// mysqldump preamble (`SET @OLD_FOREIGN_KEY_CHECKS =
    /// @@FOREIGN_KEY_CHECKS, …`); SPG accepts the token and
    /// the SET parser treats the assignment as a no-op apart
    /// from any second LHS that targets a real session
    /// parameter (e.g. `FOREIGN_KEY_CHECKS=0`).
    SessionVar(String),

    // Literals
    /// Digits-only numeric literal that fits in an `i64`.
    Integer(i64),
    /// Numeric literal containing a `.` and/or an exponent.
    Float(f64),
    /// Body of a `'...'` literal (with `''` collapsed to `'`) or of a
    /// dollar-quoted string (taken verbatim).
    String(String),

    // Operators
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `=`
    Eq,
    /// `<>` or `!=`
    NotEq,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>`
    Gt,
    /// `>=`
    GtEq,

    // Punctuation
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `;`
    Semicolon,
    /// `.` (when it does not start a numeric literal such as `.5`)
    Dot,

    // Multi-byte operators
    /// v4.14 `->` — JSON object/array element access, returns json.
    JsonGet,
    /// v4.14 `->>` — same access, returns text.
    JsonGetText,
    /// v6.4.5 `#>` — JSON path walk, returns json. Path is the
    /// right-hand TEXT with PG `{a,b,0}` syntax.
    JsonGetPath,
    /// v6.4.5 `#>>` — same walk, returns text.
    JsonGetPathText,
    /// v6.4.5 `@>` — JSON containment. `j @> sub` returns true if
    /// every key/value in `sub` is present in `j` with structural
    /// containment for objects + arrays.
    JsonContains,
    /// v7.12.2 `@@` — tsvector / tsquery match. Either ordering
    /// (`vec @@ q` or `q @@ vec`) parses; engine eval normalises
    /// before matching.
    TsMatch,
    /// pgvector L2 distance operator `<->`. Lexed as one token so the
    /// parser can give it its own precedence rung.
    L2Distance,
    /// pgvector inner-product operator `<#>` (returns negative dot product
    /// so smaller still means more similar — same semantics as pgvector).
    InnerProduct,
    /// pgvector cosine distance operator `<=>`.
    CosineDistance,
    /// PG-style cast `expr::type` — single token because we want it to bind
    /// at postfix precedence.
    DoubleColon,
    /// v7.12.4 — PL/pgSQL assignment operator `:=`.
    /// Outside PL/pgSQL bodies this token has no SQL-side meaning.
    ColonEq,
    /// v7.12.4 — bare `:` separator. Used inside `tsvector` external-form
    /// literals (`'cat:1 dog:2'::tsvector`) and as the fallback path for
    /// the PL/pgSQL assignment lexer.
    Colon,
    /// Standard SQL string concatenation `||`.
    Concat,

    // More keywords
    /// `IS` keyword — postfix `IS NULL` / `IS NOT NULL` predicates.
    Is,
    Between,
    In,
    Like,
    Group,
    Distinct,
    Union,
    All,
    Join,
    Inner,
    Left,
    Cross,
    Outer,
    Default,
    Savepoint,
    Release,
    To,
    Having,
    Show,
    Extract,
    Offset,
    Asc,
    Desc,
    /// `INTERVAL` — followed by a string literal carrying the span text
    /// (e.g. `INTERVAL '1 day 2 hours'`).
    Interval,
    /// v6.1.1 — `$N` parameter placeholder for the extended query
    /// protocol. The number N is 1-based per PostgreSQL convention.
    /// `$0` is not valid, nor is any N above `u16::MAX`; the lexer
    /// rejects both.
    Placeholder(u16),

    /// v6.1.2 — `DROP` keyword. Used by `DROP PUBLICATION <name>`.
    /// Reserved for future `DROP TABLE` / `DROP INDEX` / `DROP USER`
    /// surface that currently goes through SHOW-shaped admin SQL.
    Drop,
    /// v6.1.2 — `FOR` keyword (publication scope).
    For,
    /// v6.1.2 — `TABLES` plural keyword (`FOR ALL TABLES`,
    /// `FOR ALL TABLES EXCEPT …`). The existing `TABLE` keyword
    /// stays a separate token so `CREATE TABLE`'s single-table
    /// form keeps lexing as today.
    Tables,
    /// v6.1.3 (reserved at v6.1.2 to keep the AST shape stable) —
    /// `EXCEPT` keyword for `FOR ALL TABLES EXCEPT t1, t2`.
    Except,
    /// v6.1.2 — `PUBLICATION` keyword.
    Publication,
    /// v6.1.4 (reserved at v6.1.2) — `SUBSCRIPTION` keyword.
    Subscription,
    /// v6.1.4 — `CONNECTION` keyword (for
    /// `CREATE SUBSCRIPTION … CONNECTION '<conn_str>' …`).
    Connection,

    /// End-of-input marker; always the last token of a successful lex.
    Eof,
}
173
/// The reason a lex attempt failed; paired with a byte offset in [`LexError`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A character that does not start any recognised token.
    UnknownChar(char),
    /// A `'...'` literal or dollar-quoted string with no closing delimiter.
    UnterminatedString,
    /// A `"..."` or backtick-quoted identifier with no closing quote.
    UnterminatedQuotedIdent,
    /// A `/*` block comment with no matching `*/`.
    UnterminatedBlockComment,
    /// A malformed or out-of-range numeric literal or `$N` placeholder;
    /// carries the offending source text.
    BadNumber(String),
}
182
/// A lexing failure: what went wrong and where.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// What kind of construct could not be lexed.
    pub kind: LexErrorKind,
    /// Byte offset into the input at which the offending construct starts.
    pub pos: usize,
}
188
189impl fmt::Display for LexError {
190    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
191        match &self.kind {
192            LexErrorKind::UnknownChar(c) => write!(f, "unknown char {c:?} at byte {}", self.pos),
193            LexErrorKind::UnterminatedString => {
194                write!(f, "unterminated string literal at byte {}", self.pos)
195            }
196            LexErrorKind::UnterminatedQuotedIdent => {
197                write!(f, "unterminated quoted identifier at byte {}", self.pos)
198            }
199            LexErrorKind::UnterminatedBlockComment => {
200                write!(f, "unterminated /* */ comment at byte {}", self.pos)
201            }
202            LexErrorKind::BadNumber(s) => {
203                write!(f, "invalid number literal {s:?} at byte {}", self.pos)
204            }
205        }
206    }
207}
208
209/// Tokenize `input` into a `Vec<Token>` ending in `Token::Eof`.
210#[allow(clippy::too_many_lines)] // big match — splitting would obscure the dispatch table
211pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
212    let bytes = input.as_bytes();
213    let mut i = 0usize;
214    let mut out = Vec::new();
215
216    while i < bytes.len() {
217        let b = bytes[i];
218        match b {
219            b' ' | b'\t' | b'\n' | b'\r' => {
220                i += 1;
221            }
222            b'-' if peek_eq(bytes, i + 1, b'-') => {
223                i += 2;
224                while i < bytes.len() && bytes[i] != b'\n' {
225                    i += 1;
226                }
227            }
228            b'/' if peek_eq(bytes, i + 1, b'*') => {
229                let start = i;
230                // v7.14.0 — MySQL versioned conditional comment
231                // `/*!NNNNN <body> */`. The body is real SQL that
232                // MySQL/MariaDB executes when the runtime version
233                // matches the 5-digit code; PG strips the whole
234                // thing as a block comment. SPG sides with MySQL
235                // semantics for dump compatibility: skip the
236                // `/*!NNNNN ` prefix and continue lexing the body
237                // as ordinary tokens. The closing `*/` is later
238                // matched + skipped by the symmetric arm below.
239                if peek_eq(bytes, i + 2, b'!') {
240                    let mut j = i + 3;
241                    // skip the optional 5-digit version code +
242                    // following single whitespace
243                    while j < bytes.len() && bytes[j].is_ascii_digit() {
244                        j += 1;
245                    }
246                    if j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') {
247                        j += 1;
248                    }
249                    i = j;
250                    continue;
251                }
252                i += 2;
253                let mut closed = false;
254                while i + 1 < bytes.len() {
255                    if bytes[i] == b'*' && bytes[i + 1] == b'/' {
256                        i += 2;
257                        closed = true;
258                        break;
259                    }
260                    i += 1;
261                }
262                if !closed {
263                    return Err(LexError {
264                        kind: LexErrorKind::UnterminatedBlockComment,
265                        pos: start,
266                    });
267                }
268            }
269            // v7.14.0 — bare `*/` (closing of the v7.14 MySQL
270            // versioned-comment opener that didn't consume the
271            // closer). We treat it as an inline comment terminator
272            // and skip 2 bytes.
273            b'*' if peek_eq(bytes, i + 1, b'/') => {
274                i += 2;
275            }
276            b'\'' => {
277                let (tok, consumed) = lex_quoted(input, i, b'\'', false)?;
278                out.push(tok);
279                i += consumed;
280            }
281            b'"' => {
282                let (tok, consumed) = lex_quoted(input, i, b'"', true)?;
283                out.push(tok);
284                i += consumed;
285            }
286            // MySQL-flavoured backtick-quoted identifier. Same semantics
287            // as the standard `"..."` form, including embedded "``" as
288            // a literal backtick.
289            b'`' => {
290                let (tok, consumed) = lex_quoted(input, i, b'`', true)?;
291                out.push(tok);
292                i += consumed;
293            }
294            b if b.is_ascii_alphabetic() || b == b'_' => {
295                let start = i;
296                i += 1;
297                while i < bytes.len() {
298                    let c = bytes[i];
299                    if c.is_ascii_alphanumeric() || c == b'_' {
300                        i += 1;
301                    } else {
302                        break;
303                    }
304                }
305                let raw = &input[start..i];
306                // v3.0.5: try the keyword table case-insensitively
307                // without allocating; only the ident fall-through
308                // pays for a lowercase String.
309                out.push(keyword_or_ident_raw(raw));
310            }
311            b if b.is_ascii_digit() => {
312                let (tok, consumed) =
313                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
314                out.push(tok);
315                i += consumed;
316            }
317            b'.' if peek_pred(bytes, i + 1, u8::is_ascii_digit) => {
318                let (tok, consumed) =
319                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
320                out.push(tok);
321                i += consumed;
322            }
323            b'+' => single(&mut out, Token::Plus, &mut i),
324            b'-' => {
325                // v4.14: `->>` and `->` for JSON path access. `->>`
326                // must be tried before `->` (longest match).
327                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
328                    out.push(Token::JsonGetText);
329                    i += 3;
330                } else if peek_eq(bytes, i + 1, b'>') {
331                    out.push(Token::JsonGet);
332                    i += 2;
333                } else {
334                    single(&mut out, Token::Minus, &mut i);
335                }
336            }
337            // v6.4.5: `#>>` and `#>` JSON path walk.
338            b'#' => {
339                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
340                    out.push(Token::JsonGetPathText);
341                    i += 3;
342                } else if peek_eq(bytes, i + 1, b'>') {
343                    out.push(Token::JsonGetPath);
344                    i += 2;
345                } else {
346                    return Err(LexError {
347                        kind: LexErrorKind::UnknownChar('#'),
348                        pos: i,
349                    });
350                }
351            }
352            // v6.4.5: `@>` JSON containment.
353            // v7.12.2: `@@` tsvector / tsquery match.
354            // v7.14.0: `@@NAME` MySQL session variable ref +
355            //          `@NAME` user variable ref. mysqldump preamble
356            //          uses both heavily (`SET @OLD_FOREIGN_KEY_CHECKS
357            //          = @@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0`).
358            //          We lex both as a single SessionVar token so
359            //          the parser can accept and ignore them.
360            b'@' => {
361                if peek_eq(bytes, i + 1, b'>') {
362                    out.push(Token::JsonContains);
363                    i += 2;
364                } else if peek_eq(bytes, i + 1, b'@')
365                    && !is_session_var_ident_start(bytes.get(i + 2).copied())
366                {
367                    // `@@` not followed by an ident-start byte is
368                    // the tsquery `@@` operator.
369                    out.push(Token::TsMatch);
370                    i += 2;
371                } else {
372                    // `@VAR` / `@@VAR` — MySQL user / session
373                    // variable reference. Consume the ident-shaped
374                    // tail and emit as Token::SessionVar so the
375                    // SET parser can accept-and-ignore.
376                    let prefix_end = if peek_eq(bytes, i + 1, b'@') { i + 2 } else { i + 1 };
377                    let mut end = prefix_end;
378                    while end < bytes.len() && is_session_var_ident_continue(bytes[end]) {
379                        end += 1;
380                    }
381                    if end == prefix_end {
382                        return Err(LexError {
383                            kind: LexErrorKind::UnknownChar('@'),
384                            pos: i,
385                        });
386                    }
387                    out.push(Token::SessionVar(input[i..end].to_string()));
388                    i = end;
389                }
390            }
391            b'*' => single(&mut out, Token::Star, &mut i),
392            b'/' => single(&mut out, Token::Slash, &mut i),
393            b'(' => single(&mut out, Token::LParen, &mut i),
394            b')' => single(&mut out, Token::RParen, &mut i),
395            b'[' => single(&mut out, Token::LBracket, &mut i),
396            b']' => single(&mut out, Token::RBracket, &mut i),
397            b',' => single(&mut out, Token::Comma, &mut i),
398            b';' => single(&mut out, Token::Semicolon, &mut i),
399            b'.' => single(&mut out, Token::Dot, &mut i),
400            b'=' => single(&mut out, Token::Eq, &mut i),
401            b'<' => {
402                if peek_eq(bytes, i + 1, b'=') && peek_eq(bytes, i + 2, b'>') {
403                    out.push(Token::CosineDistance);
404                    i += 3;
405                } else if peek_eq(bytes, i + 1, b'#') && peek_eq(bytes, i + 2, b'>') {
406                    out.push(Token::InnerProduct);
407                    i += 3;
408                } else if peek_eq(bytes, i + 1, b'-') && peek_eq(bytes, i + 2, b'>') {
409                    out.push(Token::L2Distance);
410                    i += 3;
411                } else if peek_eq(bytes, i + 1, b'=') {
412                    out.push(Token::LtEq);
413                    i += 2;
414                } else if peek_eq(bytes, i + 1, b'>') {
415                    out.push(Token::NotEq);
416                    i += 2;
417                } else {
418                    out.push(Token::Lt);
419                    i += 1;
420                }
421            }
422            b':' if peek_eq(bytes, i + 1, b':') => {
423                out.push(Token::DoubleColon);
424                i += 2;
425            }
426            b':' if peek_eq(bytes, i + 1, b'=') => {
427                // v7.12.4 — PL/pgSQL assignment operator `:=`.
428                out.push(Token::ColonEq);
429                i += 2;
430            }
431            b':' => {
432                // v7.12.4 — bare `:`. Used inside `tsvector` external-form
433                // literals which the cast parser consumes in-token, and as a
434                // separator the PL/pgSQL assignment lexer can recover from.
435                out.push(Token::Colon);
436                i += 1;
437            }
438            b'|' if peek_eq(bytes, i + 1, b'|') => {
439                out.push(Token::Concat);
440                i += 2;
441            }
442            b'>' => {
443                if peek_eq(bytes, i + 1, b'=') {
444                    out.push(Token::GtEq);
445                    i += 2;
446                } else {
447                    out.push(Token::Gt);
448                    i += 1;
449                }
450            }
451            b'!' if peek_eq(bytes, i + 1, b'=') => {
452                out.push(Token::NotEq);
453                i += 2;
454            }
455            // v7.9.27 — PG dollar-quoted string `$$ … $$` (or
456            // `$tag$ … $tag$`). Used in `DO $$ … $$ LANGUAGE
457            // plpgsql;` blocks that pg_dump emits for idempotent
458            // migrations. SPG has no PL/pgSQL, so the lexer
459            // consumes the entire string as a single Token::String
460            // and the parser treats the surrounding `DO …;` as a
461            // no-op. mailrs follow-up H1.
462            b'$' if i + 1 < bytes.len() && bytes[i + 1] == b'$' => {
463                // Empty tag form: `$$ … $$`.
464                let end = find_dollar_tag_end(bytes, i + 2, b"$$");
465                let body = match end {
466                    Some(e) => &input[i + 2..e],
467                    None => {
468                        return Err(LexError {
469                            kind: LexErrorKind::UnterminatedString,
470                            pos: i,
471                        });
472                    }
473                };
474                out.push(Token::String(body.to_string()));
475                i = end.unwrap() + 2;
476            }
477            b'$' if i + 1 < bytes.len()
478                && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_') =>
479            {
480                // Tagged form: `$foo$ … $foo$`. Scan the tag
481                // ident, find the closing copy.
482                let mut j = i + 1;
483                while j < bytes.len() && (bytes[j].is_ascii_alphanumeric() || bytes[j] == b'_') {
484                    j += 1;
485                }
486                if j >= bytes.len() || bytes[j] != b'$' {
487                    // Not a dollar-quoted string — fall through
488                    // to the generic-unknown-char path.
489                    let ch = input[i..].chars().next().unwrap_or('?');
490                    return Err(LexError {
491                        kind: LexErrorKind::UnknownChar(ch),
492                        pos: i,
493                    });
494                }
495                let close: alloc::vec::Vec<u8> = bytes[i..=j].to_vec();
496                let end = find_dollar_tag_end(bytes, j + 1, &close);
497                let body = match end {
498                    Some(e) => &input[j + 1..e],
499                    None => {
500                        return Err(LexError {
501                            kind: LexErrorKind::UnterminatedString,
502                            pos: i,
503                        });
504                    }
505                };
506                out.push(Token::String(body.to_string()));
507                i = end.unwrap() + close.len();
508            }
509            // v6.1.1: `$N` parameter placeholder for the extended
510            // query protocol. PG numbers them 1..=N; we reject $0
511            // and a bare `$` not followed by a digit.
512            b'$' if i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() => {
513                let mut j = i + 1;
514                let mut n: u32 = 0;
515                while j < bytes.len() && bytes[j].is_ascii_digit() {
516                    n = n
517                        .saturating_mul(10)
518                        .saturating_add(u32::from(bytes[j] - b'0'));
519                    j += 1;
520                }
521                if n == 0 || n > u32::from(u16::MAX) {
522                    return Err(LexError {
523                        kind: LexErrorKind::BadNumber(input[i..j].to_string()),
524                        pos: i,
525                    });
526                }
527                #[allow(clippy::cast_possible_truncation)]
528                out.push(Token::Placeholder(n as u16));
529                i = j;
530            }
531            _ => {
532                let ch = input[i..].chars().next().unwrap_or('?');
533                return Err(LexError {
534                    kind: LexErrorKind::UnknownChar(ch),
535                    pos: i,
536                });
537            }
538        }
539    }
540    out.push(Token::Eof);
541    Ok(out)
542}
543
/// `true` when `bytes[i]` exists and equals `target`; an out-of-range `i`
/// simply yields `false`.
fn peek_eq(bytes: &[u8], i: usize, target: u8) -> bool {
    matches!(bytes.get(i), Some(&found) if found == target)
}
547
/// v7.14.0 — does `b` hold a byte that can open an identifier-shaped
/// variable name, i.e. an ASCII letter or `_`? `None` (end of input)
/// is never a start byte.
fn is_session_var_ident_start(b: Option<u8>) -> bool {
    match b {
        Some(byte) => byte == b'_' || byte.is_ascii_alphabetic(),
        None => false,
    }
}
555
/// Bytes allowed inside the name part of a `@VAR` / `@@VAR` reference:
/// ASCII letters and digits, `_`, `.` (for qualifiers such as
/// `@@global.sql_mode`) and `$`.
fn is_session_var_ident_continue(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'.' | b'$')
}
563
/// v7.9.27 — locate the closing delimiter of a dollar-quoted string.
///
/// Returns the absolute index in `bytes` at which the first occurrence of
/// `tag` (e.g. `b"$$"` or `b"$foo$"`) at or after `from` begins. Yields
/// `None` when `tag` is empty, when `from` lies past the end of `bytes`,
/// or when no occurrence exists.
fn find_dollar_tag_end(bytes: &[u8], from: usize, tag: &[u8]) -> Option<usize> {
    // `windows(0)` would panic, so the empty tag is handled up front.
    if tag.is_empty() {
        return None;
    }
    let tail = bytes.get(from..)?;
    tail.windows(tag.len())
        .position(|candidate| candidate == tag)
        .map(|offset| from + offset)
}
579
/// `true` when `bytes[i]` exists and satisfies `pred`; an out-of-range `i`
/// yields `false` without calling `pred`.
fn peek_pred<F: Fn(&u8) -> bool>(bytes: &[u8], i: usize, pred: F) -> bool {
    match bytes.get(i) {
        Some(byte) => pred(byte),
        None => false,
    }
}
583
584fn single(out: &mut Vec<Token>, tok: Token, i: &mut usize) {
585    out.push(tok);
586    *i += 1;
587}
588
589/// Length-first ASCII-CI keyword lookup. Avoids allocating a
590/// lowercase `String` when the input matches a keyword; only the ident
591/// fall-through path pays for the lowercase copy.
592///
593/// Grouped by length so the outer `match` becomes a small jump table.
594/// Within a length bucket every keyword has either a unique first
595/// byte (cheap dispatch) or a small set of disambiguating
596/// trailing-byte comparisons. All comparisons are ASCII-CI (XOR
597/// 0x20 on each byte before the compare).
598fn keyword_or_ident_raw(raw: &str) -> Token {
599    let b = raw.as_bytes();
600    let tok = match b.len() {
601        2 => kw_len2(b),
602        3 => kw_len3(b),
603        4 => kw_len4(b),
604        5 => kw_len5(b),
605        6 => kw_len6(b),
606        7 => kw_len7(b),
607        8 => kw_len8(b),
608        9 => kw_len9(b),
609        10 => kw_len10(b),
610        11 => kw_len11(b),
611        12 => kw_len12(b),
612        _ => None,
613    };
614    match tok {
615        Some(t) => t,
616        // Ident fall-through: this is the only path that allocates.
617        None => Token::Ident(raw.to_ascii_lowercase()),
618    }
619}
620
/// ASCII case-insensitive equality of `input` against `lower`.
///
/// Only `input` is case-folded, so `lower` must already be lowercase
/// for a match to be possible. Slices of different length never match.
#[inline]
fn eq_ci(input: &[u8], lower: &[u8]) -> bool {
    input.len() == lower.len()
        && input
            .iter()
            .zip(lower)
            .all(|(got, want)| got.to_ascii_lowercase() == *want)
}
638
639#[inline]
640fn kw_len2(b: &[u8]) -> Option<Token> {
641    // 7 keywords: as, by, in, is, on, or, to
642    if eq_ci(b, b"as") {
643        return Some(Token::As);
644    }
645    if eq_ci(b, b"by") {
646        return Some(Token::By);
647    }
648    if eq_ci(b, b"in") {
649        return Some(Token::In);
650    }
651    if eq_ci(b, b"is") {
652        return Some(Token::Is);
653    }
654    if eq_ci(b, b"on") {
655        return Some(Token::On);
656    }
657    if eq_ci(b, b"or") {
658        return Some(Token::Or);
659    }
660    if eq_ci(b, b"to") {
661        return Some(Token::To);
662    }
663    None
664}
665
666#[inline]
667fn kw_len3(b: &[u8]) -> Option<Token> {
668    // 5 keywords: all, and, asc, not, for
669    if eq_ci(b, b"for") {
670        return Some(Token::For);
671    }
672    if eq_ci(b, b"all") {
673        return Some(Token::All);
674    }
675    if eq_ci(b, b"and") {
676        return Some(Token::And);
677    }
678    if eq_ci(b, b"asc") {
679        return Some(Token::Asc);
680    }
681    if eq_ci(b, b"not") {
682        return Some(Token::Not);
683    }
684    None
685}
686
687#[inline]
688fn kw_len4(b: &[u8]) -> Option<Token> {
689    // 10 keywords: from, null, true, into, like, join, left, show, desc, drop
690    if eq_ci(b, b"from") {
691        return Some(Token::From);
692    }
693    if eq_ci(b, b"drop") {
694        return Some(Token::Drop);
695    }
696    if eq_ci(b, b"null") {
697        return Some(Token::Null);
698    }
699    if eq_ci(b, b"true") {
700        return Some(Token::True);
701    }
702    if eq_ci(b, b"into") {
703        return Some(Token::Into);
704    }
705    if eq_ci(b, b"like") {
706        return Some(Token::Like);
707    }
708    if eq_ci(b, b"join") {
709        return Some(Token::Join);
710    }
711    if eq_ci(b, b"left") {
712        return Some(Token::Left);
713    }
714    if eq_ci(b, b"show") {
715        return Some(Token::Show);
716    }
717    if eq_ci(b, b"desc") {
718        return Some(Token::Desc);
719    }
720    None
721}
722
723#[inline]
724fn kw_len5(b: &[u8]) -> Option<Token> {
725    // 12 keywords: false, where, table, index, begin, order, limit,
726    // group, union, inner, cross, outer
727    if eq_ci(b, b"false") {
728        return Some(Token::False);
729    }
730    if eq_ci(b, b"where") {
731        return Some(Token::Where);
732    }
733    if eq_ci(b, b"table") {
734        return Some(Token::Table);
735    }
736    if eq_ci(b, b"index") {
737        return Some(Token::Index);
738    }
739    if eq_ci(b, b"begin") {
740        return Some(Token::Begin);
741    }
742    if eq_ci(b, b"order") {
743        return Some(Token::Order);
744    }
745    if eq_ci(b, b"limit") {
746        return Some(Token::Limit);
747    }
748    if eq_ci(b, b"group") {
749        return Some(Token::Group);
750    }
751    if eq_ci(b, b"union") {
752        return Some(Token::Union);
753    }
754    if eq_ci(b, b"inner") {
755        return Some(Token::Inner);
756    }
757    if eq_ci(b, b"cross") {
758        return Some(Token::Cross);
759    }
760    if eq_ci(b, b"outer") {
761        return Some(Token::Outer);
762    }
763    None
764}
765
766#[inline]
767fn kw_len6(b: &[u8]) -> Option<Token> {
768    // 9 keywords: select, create, insert, values, commit, having, offset, tables, except
769    if eq_ci(b, b"select") {
770        return Some(Token::Select);
771    }
772    if eq_ci(b, b"tables") {
773        return Some(Token::Tables);
774    }
775    if eq_ci(b, b"except") {
776        return Some(Token::Except);
777    }
778    if eq_ci(b, b"create") {
779        return Some(Token::Create);
780    }
781    if eq_ci(b, b"insert") {
782        return Some(Token::Insert);
783    }
784    if eq_ci(b, b"values") {
785        return Some(Token::Values);
786    }
787    if eq_ci(b, b"commit") {
788        return Some(Token::Commit);
789    }
790    if eq_ci(b, b"having") {
791        return Some(Token::Having);
792    }
793    if eq_ci(b, b"offset") {
794        return Some(Token::Offset);
795    }
796    None
797}
798
799#[inline]
800fn kw_len7(b: &[u8]) -> Option<Token> {
801    // 4 keywords: between, default, release, extract
802    if eq_ci(b, b"between") {
803        return Some(Token::Between);
804    }
805    if eq_ci(b, b"default") {
806        return Some(Token::Default);
807    }
808    if eq_ci(b, b"release") {
809        return Some(Token::Release);
810    }
811    if eq_ci(b, b"extract") {
812        return Some(Token::Extract);
813    }
814    None
815}
816
817#[inline]
818fn kw_len8(b: &[u8]) -> Option<Token> {
819    // 3 keywords: rollback, distinct, interval
820    if eq_ci(b, b"rollback") {
821        return Some(Token::Rollback);
822    }
823    if eq_ci(b, b"distinct") {
824        return Some(Token::Distinct);
825    }
826    if eq_ci(b, b"interval") {
827        return Some(Token::Interval);
828    }
829    None
830}
831
832#[inline]
833fn kw_len9(b: &[u8]) -> Option<Token> {
834    // 1 keyword: savepoint
835    if eq_ci(b, b"savepoint") {
836        return Some(Token::Savepoint);
837    }
838    None
839}
840
841#[inline]
842fn kw_len10(b: &[u8]) -> Option<Token> {
843    // 1 keyword: connection
844    if eq_ci(b, b"connection") {
845        return Some(Token::Connection);
846    }
847    None
848}
849
850#[inline]
851fn kw_len11(b: &[u8]) -> Option<Token> {
852    // 1 keyword: publication
853    if eq_ci(b, b"publication") {
854        return Some(Token::Publication);
855    }
856    None
857}
858
859#[inline]
860fn kw_len12(b: &[u8]) -> Option<Token> {
861    // 1 keyword: subscription
862    if eq_ci(b, b"subscription") {
863        return Some(Token::Subscription);
864    }
865    None
866}
867
868/// Lex a `'...'` string literal or `"..."` quoted identifier. The opening
869/// quote sits at `input[start]`; `quote` is its byte value. `is_ident` selects
870/// the resulting token shape.
871///
872/// PG-style doubling escapes the quote: `''` inside `'...'` is a literal `'`,
873/// same for `""` inside `"..."`.
874fn lex_quoted(
875    input: &str,
876    start: usize,
877    quote: u8,
878    is_ident: bool,
879) -> Result<(Token, usize), LexError> {
880    let bytes = input.as_bytes();
881    let mut i = start + 1;
882    let mut s = String::new();
883    loop {
884        if i >= bytes.len() {
885            return Err(LexError {
886                kind: if is_ident {
887                    LexErrorKind::UnterminatedQuotedIdent
888                } else {
889                    LexErrorKind::UnterminatedString
890                },
891                pos: start,
892            });
893        }
894        if bytes[i] == quote {
895            if peek_eq(bytes, i + 1, quote) {
896                s.push(quote as char);
897                i += 2;
898            } else {
899                i += 1;
900                break;
901            }
902        } else {
903            let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
904            s.push(ch);
905            i += ch.len_utf8();
906        }
907    }
908    let tok = if is_ident {
909        Token::QuotedIdent(s)
910    } else {
911        Token::String(s)
912    };
913    Ok((tok, i - start))
914}
915
916fn lex_number(s: &str) -> Result<(Token, usize), LexErrorKind> {
917    let bytes = s.as_bytes();
918    let mut i = 0usize;
919    let mut is_float = false;
920
921    while i < bytes.len() && bytes[i].is_ascii_digit() {
922        i += 1;
923    }
924    if i < bytes.len() && bytes[i] == b'.' {
925        is_float = true;
926        i += 1;
927        while i < bytes.len() && bytes[i].is_ascii_digit() {
928            i += 1;
929        }
930    }
931    if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
932        is_float = true;
933        i += 1;
934        if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
935            i += 1;
936        }
937        let exp_start = i;
938        while i < bytes.len() && bytes[i].is_ascii_digit() {
939            i += 1;
940        }
941        if exp_start == i {
942            return Err(LexErrorKind::BadNumber(s[..i].to_string()));
943        }
944    }
945
946    let lit = &s[..i];
947    if is_float {
948        lit.parse::<f64>()
949            .map(|v| (Token::Float(v), i))
950            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
951    } else {
952        lit.parse::<i64>()
953            .map(|v| (Token::Integer(v), i))
954            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
955    }
956}
957
/// Unit tests for the lexer's public `tokenize` entry point.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Tokenize `s`, panicking if the lexer reports an error.
    fn lex(s: &str) -> Vec<Token> {
        tokenize(s).expect("lex ok")
    }

    #[test]
    fn empty_yields_only_eof() {
        assert_eq!(lex(""), vec![Token::Eof]);
    }

    #[test]
    fn whitespace_only_yields_only_eof() {
        assert_eq!(lex("   \t\n  "), vec![Token::Eof]);
    }

    #[test]
    fn keywords_are_case_insensitive() {
        assert_eq!(
            lex("SELECT select Select"),
            vec![Token::Select, Token::Select, Token::Select, Token::Eof]
        );
    }

    #[test]
    fn identifiers_lowercase_ascii() {
        assert_eq!(
            lex("hello WORLD _x x1"),
            vec![
                Token::Ident("hello".into()),
                Token::Ident("world".into()),
                Token::Ident("_x".into()),
                Token::Ident("x1".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn quoted_identifier_keeps_case_and_handles_embedded_quote() {
        assert_eq!(
            lex(r#""User Name" "a""b""#),
            vec![
                Token::QuotedIdent("User Name".into()),
                Token::QuotedIdent("a\"b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn integer_and_float_literals() {
        assert_eq!(
            lex("0 42 1.5 .5 1e10 2.5e-3"),
            vec![
                Token::Integer(0),
                Token::Integer(42),
                Token::Float(1.5),
                Token::Float(0.5),
                Token::Float(1e10),
                Token::Float(2.5e-3),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn negative_number_is_minus_then_integer() {
        // PG follows this: unary minus is a separate token, parser folds it.
        assert_eq!(
            lex("-42"),
            vec![Token::Minus, Token::Integer(42), Token::Eof]
        );
    }

    #[test]
    fn string_literal_doubled_quote_escape() {
        assert_eq!(
            lex("'hello' 'it''s'"),
            vec![
                Token::String("hello".into()),
                Token::String("it's".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn all_comparison_and_arithmetic_operators() {
        assert_eq!(
            lex("= <> != < <= > >= + - * /"),
            vec![
                Token::Eq,
                Token::NotEq,
                Token::NotEq,
                Token::Lt,
                Token::LtEq,
                Token::Gt,
                Token::GtEq,
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn punctuation() {
        assert_eq!(
            lex("( ) , ; ."),
            vec![
                Token::LParen,
                Token::RParen,
                Token::Comma,
                Token::Semicolon,
                Token::Dot,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn line_comment_skipped() {
        assert_eq!(
            lex("SELECT -- trailing junk\nFROM"),
            vec![Token::Select, Token::From, Token::Eof]
        );
    }

    #[test]
    fn block_comment_skipped() {
        assert_eq!(
            lex("SELECT /* skipped */ 1"),
            vec![Token::Select, Token::Integer(1), Token::Eof]
        );
    }

    #[test]
    fn unterminated_string_errors() {
        let err = tokenize("'oops").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
        // The reported position is the opening quote, not end of input.
        assert_eq!(err.pos, 0);
    }

    #[test]
    fn unterminated_block_comment_errors() {
        let err = tokenize("/* never closed").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedBlockComment));
    }

    #[test]
    fn unknown_char_errors() {
        let err = tokenize("@").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnknownChar('@')));
    }

    #[test]
    fn dot_in_qualified_column() {
        assert_eq!(
            lex("t.col"),
            vec![
                Token::Ident("t".into()),
                Token::Dot,
                Token::Ident("col".into()),
                Token::Eof,
            ]
        );
    }

    // --- v0.11 brackets + distance op + vector keyword --------------------

    #[test]
    fn brackets_are_distinct_tokens() {
        assert_eq!(
            lex("[ ]"),
            vec![Token::LBracket, Token::RBracket, Token::Eof]
        );
    }

    #[test]
    fn l2_distance_is_three_char_token() {
        assert_eq!(
            lex("a <-> b"),
            vec![
                Token::Ident("a".into()),
                Token::L2Distance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        // Bare `<-` should NOT match L2Distance.
        assert_eq!(
            lex("a <- b"),
            vec![
                Token::Ident("a".into()),
                Token::Lt,
                Token::Minus,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn order_by_limit_are_keywords() {
        assert_eq!(
            lex("ORDER BY LIMIT"),
            vec![Token::Order, Token::By, Token::Limit, Token::Eof]
        );
    }

    // --- v1.2: pgvector distance ops + PG cast --------------------------

    #[test]
    fn inner_product_operator_3char() {
        assert_eq!(
            lex("a <#> b"),
            vec![
                Token::Ident("a".into()),
                Token::InnerProduct,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn cosine_distance_operator_3char() {
        assert_eq!(
            lex("a <=> b"),
            vec![
                Token::Ident("a".into()),
                Token::CosineDistance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        // The other `<`-prefixed operators must still lex correctly now that
        // `<=>` exists (greedy match takes the longest): `<=`, `<>`, `<->`.
        assert_eq!(
            lex("a <= b"),
            vec![
                Token::Ident("a".into()),
                Token::LtEq,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        assert_eq!(
            lex("a <> b"),
            vec![
                Token::Ident("a".into()),
                Token::NotEq,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        assert_eq!(
            lex("a <-> b"),
            vec![
                Token::Ident("a".into()),
                Token::L2Distance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn double_colon_cast_token() {
        assert_eq!(
            lex("x::INT"),
            vec![
                Token::Ident("x".into()),
                Token::DoubleColon,
                Token::Ident("int".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn lone_single_colon_lexes_as_colon_token() {
        // v7.12.4 — single `:` is now a token (PL/pgSQL surface
        // + tsvector external-form literal both need it). The
        // pre-v7.12.4 "single colon = unknown char" behaviour
        // was incidental.
        //
        // Assert the whole stream so a regression in what follows the
        // colon is caught too.
        assert_eq!(
            lex(":x"),
            vec![Token::Colon, Token::Ident("x".into()), Token::Eof]
        );
    }

    #[test]
    fn colon_eq_lexes_as_assignment() {
        // v7.12.4 — PL/pgSQL assignment operator.
        // Tokens: Ident("x"), ColonEq, Integer(1), Eof.
        assert_eq!(
            lex("x := 1"),
            vec![
                Token::Ident("x".into()),
                Token::ColonEq,
                Token::Integer(1),
                Token::Eof,
            ]
        );
    }
}
spg_sql/lexer.rs

spg_sql/
lexer.rs