spg_sql/
lexer.rs

1//! Lexer for the PG-dialect subset that SPG accepts.
2//!
3//! v0.2 token stream is value-only — no source spans yet. Errors do report
4//! the byte offset where the offending construct started. Identifiers are
5//! ASCII case-folded to lower-case (matches PG when un-quoted). Quoted
6//! identifiers (`"..."`) preserve case; `""` is an embedded quote.
//! String literals (`'...'`) follow PG single-quote convention with `''`
//! as the embedded quote. Dollar-quoted strings (`$$...$$` and
//! `$tag$...$tag$`) are lexed verbatim into a single string token. The
//! lexer accepts but does not interpret E-strings — backslash escapes get
//! no special handling.
10
11use alloc::string::{String, ToString};
12use alloc::vec::Vec;
13use core::fmt;
14
/// One lexical token produced by [`tokenize`].
///
/// Tokens carry values only (no source spans). Keyword variants are
/// matched ASCII case-insensitively by the lexer; anything that is not a
/// keyword becomes [`Token::Ident`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    As,
    Null,
    True,
    False,
    And,
    Or,
    Not,
    Create,
    Table,
    Insert,
    Into,
    Values,
    Index,
    On,
    Begin,
    Commit,
    Rollback,
    Order,
    By,
    Limit,

    // Identifiers
    /// Unquoted identifier, ASCII case-folded to lower-case.
    Ident(String),
    /// Quoted identifier (`"..."` or MySQL-style `` `...` ``) with its
    /// original case; a doubled quote character inside is one literal quote.
    QuotedIdent(String),

    // Literals
    /// Integer literal (no fraction, no exponent).
    Integer(i64),
    /// Numeric literal with a fractional part and/or an exponent.
    Float(f64),
    /// String literal body: `'...'` with `''` unescaped, or the verbatim
    /// body of a dollar-quoted string.
    String(String),

    // Operators
    Plus,
    Minus,
    Star,
    Slash,
    Eq,
    /// `<>` or `!=`.
    NotEq,
    Lt,
    LtEq,
    Gt,
    GtEq,

    // Punctuation
    LParen,
    RParen,
    LBracket,
    RBracket,
    Comma,
    Semicolon,
    Dot,

    // Multi-character operators
    /// v4.14 `->` — JSON object/array element access, returns json.
    JsonGet,
    /// v4.14 `->>` — same access, returns text.
    JsonGetText,
    /// v6.4.5 `#>` — JSON path walk, returns json. Path is the
    /// right-hand TEXT with PG `{a,b,0}` syntax.
    JsonGetPath,
    /// v6.4.5 `#>>` — same walk, returns text.
    JsonGetPathText,
    /// v6.4.5 `@>` — JSON containment. `j @> sub` returns true if
    /// every key/value in `sub` is present in `j` with structural
    /// containment for objects + arrays.
    JsonContains,
    /// v7.12.2 `@@` — tsvector / tsquery match. Either ordering
    /// (`vec @@ q` or `q @@ vec`) parses; engine eval normalises
    /// before matching.
    TsMatch,
    /// pgvector L2 distance operator `<->`. Lexed as one token so the
    /// parser can give it its own precedence rung.
    L2Distance,
    /// pgvector inner-product operator `<#>` (returns negative dot product
    /// so smaller still means more similar — same semantics as pgvector).
    InnerProduct,
    /// pgvector cosine distance operator `<=>`.
    CosineDistance,
    /// PG-style cast `expr::type` — single token because we want it to bind
    /// at postfix precedence.
    DoubleColon,
    /// v7.12.4 — PL/pgSQL assignment operator `:=`.
    /// Outside PL/pgSQL bodies this token has no SQL-side meaning.
    ColonEq,
    /// v7.12.4 — bare `:` separator. Used inside `tsvector` external-form
    /// literals (`'cat:1 dog:2'::tsvector`) and as the fallback path for
    /// the PL/pgSQL assignment lexer.
    Colon,
    /// Standard SQL string concatenation `||`.
    Concat,

    // Keywords added after the original set
    /// `IS` keyword — postfix `IS NULL` / `IS NOT NULL` predicates.
    Is,
    Between,
    In,
    Like,
    Group,
    Distinct,
    Union,
    All,
    Join,
    Inner,
    Left,
    Cross,
    Outer,
    Default,
    Savepoint,
    Release,
    To,
    Having,
    Show,
    Extract,
    Offset,
    Asc,
    Desc,
    /// `INTERVAL` — followed by a string literal carrying the span text
    /// (e.g. `INTERVAL '1 day 2 hours'`).
    Interval,
    /// v6.1.1 — `$N` parameter placeholder for the extended query
    /// protocol. The number N is 1-based per PostgreSQL convention.
    /// `$0` and values above `u16::MAX` are not valid; the lexer
    /// rejects them.
    Placeholder(u16),

    /// v6.1.2 — `DROP` keyword. Used by `DROP PUBLICATION <name>`.
    /// Reserved for future `DROP TABLE` / `DROP INDEX` / `DROP USER`
    /// surface that currently goes through SHOW-shaped admin SQL.
    Drop,
    /// v6.1.2 — `FOR` keyword (publication scope).
    For,
    /// v6.1.2 — `TABLES` plural keyword (`FOR ALL TABLES`,
    /// `FOR ALL TABLES EXCEPT …`). The existing `TABLE` keyword
    /// stays a separate token so `CREATE TABLE`'s single-table
    /// form keeps lexing as today.
    Tables,
    /// v6.1.3 (reserved at v6.1.2 to keep the AST shape stable) —
    /// `EXCEPT` keyword for `FOR ALL TABLES EXCEPT t1, t2`.
    Except,
    /// v6.1.2 — `PUBLICATION` keyword.
    Publication,
    /// v6.1.4 (reserved at v6.1.2) — `SUBSCRIPTION` keyword.
    Subscription,
    /// v6.1.4 — `CONNECTION` keyword (for
    /// `CREATE SUBSCRIPTION … CONNECTION '<conn_str>' …`).
    Connection,

    /// End-of-input marker; always the last token of a successful lex.
    Eof,
}
164
/// The category of failure reported by the lexer.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum LexErrorKind {
    /// A character that does not start any recognised token.
    UnknownChar(char),
    /// A string literal (quoted or dollar-quoted) with no closing delimiter.
    UnterminatedString,
    /// A quoted identifier with no closing quote.
    UnterminatedQuotedIdent,
    /// A `/* ... */` comment with no closing `*/`.
    UnterminatedBlockComment,
    /// A malformed numeric literal or placeholder; carries the offending text.
    BadNumber(String),
}
173
174#[derive(Debug, Clone, PartialEq, Eq)]
175pub struct LexError {
176    pub kind: LexErrorKind,
177    pub pos: usize,
178}
179
180impl fmt::Display for LexError {
181    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
182        match &self.kind {
183            LexErrorKind::UnknownChar(c) => write!(f, "unknown char {c:?} at byte {}", self.pos),
184            LexErrorKind::UnterminatedString => {
185                write!(f, "unterminated string literal at byte {}", self.pos)
186            }
187            LexErrorKind::UnterminatedQuotedIdent => {
188                write!(f, "unterminated quoted identifier at byte {}", self.pos)
189            }
190            LexErrorKind::UnterminatedBlockComment => {
191                write!(f, "unterminated /* */ comment at byte {}", self.pos)
192            }
193            LexErrorKind::BadNumber(s) => {
194                write!(f, "invalid number literal {s:?} at byte {}", self.pos)
195            }
196        }
197    }
198}
199
200/// Tokenize `input` into a `Vec<Token>` ending in `Token::Eof`.
201#[allow(clippy::too_many_lines)] // big match — splitting would obscure the dispatch table
202pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
203    let bytes = input.as_bytes();
204    let mut i = 0usize;
205    let mut out = Vec::new();
206
207    while i < bytes.len() {
208        let b = bytes[i];
209        match b {
210            b' ' | b'\t' | b'\n' | b'\r' => {
211                i += 1;
212            }
213            b'-' if peek_eq(bytes, i + 1, b'-') => {
214                i += 2;
215                while i < bytes.len() && bytes[i] != b'\n' {
216                    i += 1;
217                }
218            }
219            b'/' if peek_eq(bytes, i + 1, b'*') => {
220                let start = i;
221                i += 2;
222                let mut closed = false;
223                while i + 1 < bytes.len() {
224                    if bytes[i] == b'*' && bytes[i + 1] == b'/' {
225                        i += 2;
226                        closed = true;
227                        break;
228                    }
229                    i += 1;
230                }
231                if !closed {
232                    return Err(LexError {
233                        kind: LexErrorKind::UnterminatedBlockComment,
234                        pos: start,
235                    });
236                }
237            }
238            b'\'' => {
239                let (tok, consumed) = lex_quoted(input, i, b'\'', false)?;
240                out.push(tok);
241                i += consumed;
242            }
243            b'"' => {
244                let (tok, consumed) = lex_quoted(input, i, b'"', true)?;
245                out.push(tok);
246                i += consumed;
247            }
248            // MySQL-flavoured backtick-quoted identifier. Same semantics
249            // as the standard `"..."` form, including embedded "``" as
250            // a literal backtick.
251            b'`' => {
252                let (tok, consumed) = lex_quoted(input, i, b'`', true)?;
253                out.push(tok);
254                i += consumed;
255            }
256            b if b.is_ascii_alphabetic() || b == b'_' => {
257                let start = i;
258                i += 1;
259                while i < bytes.len() {
260                    let c = bytes[i];
261                    if c.is_ascii_alphanumeric() || c == b'_' {
262                        i += 1;
263                    } else {
264                        break;
265                    }
266                }
267                let raw = &input[start..i];
268                // v3.0.5: try the keyword table case-insensitively
269                // without allocating; only the ident fall-through
270                // pays for a lowercase String.
271                out.push(keyword_or_ident_raw(raw));
272            }
273            b if b.is_ascii_digit() => {
274                let (tok, consumed) =
275                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
276                out.push(tok);
277                i += consumed;
278            }
279            b'.' if peek_pred(bytes, i + 1, u8::is_ascii_digit) => {
280                let (tok, consumed) =
281                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
282                out.push(tok);
283                i += consumed;
284            }
285            b'+' => single(&mut out, Token::Plus, &mut i),
286            b'-' => {
287                // v4.14: `->>` and `->` for JSON path access. `->>`
288                // must be tried before `->` (longest match).
289                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
290                    out.push(Token::JsonGetText);
291                    i += 3;
292                } else if peek_eq(bytes, i + 1, b'>') {
293                    out.push(Token::JsonGet);
294                    i += 2;
295                } else {
296                    single(&mut out, Token::Minus, &mut i);
297                }
298            }
299            // v6.4.5: `#>>` and `#>` JSON path walk.
300            b'#' => {
301                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
302                    out.push(Token::JsonGetPathText);
303                    i += 3;
304                } else if peek_eq(bytes, i + 1, b'>') {
305                    out.push(Token::JsonGetPath);
306                    i += 2;
307                } else {
308                    return Err(LexError {
309                        kind: LexErrorKind::UnknownChar('#'),
310                        pos: i,
311                    });
312                }
313            }
314            // v6.4.5: `@>` JSON containment.
315            // v7.12.2: `@@` tsvector / tsquery match.
316            b'@' => {
317                if peek_eq(bytes, i + 1, b'>') {
318                    out.push(Token::JsonContains);
319                    i += 2;
320                } else if peek_eq(bytes, i + 1, b'@') {
321                    out.push(Token::TsMatch);
322                    i += 2;
323                } else {
324                    return Err(LexError {
325                        kind: LexErrorKind::UnknownChar('@'),
326                        pos: i,
327                    });
328                }
329            }
330            b'*' => single(&mut out, Token::Star, &mut i),
331            b'/' => single(&mut out, Token::Slash, &mut i),
332            b'(' => single(&mut out, Token::LParen, &mut i),
333            b')' => single(&mut out, Token::RParen, &mut i),
334            b'[' => single(&mut out, Token::LBracket, &mut i),
335            b']' => single(&mut out, Token::RBracket, &mut i),
336            b',' => single(&mut out, Token::Comma, &mut i),
337            b';' => single(&mut out, Token::Semicolon, &mut i),
338            b'.' => single(&mut out, Token::Dot, &mut i),
339            b'=' => single(&mut out, Token::Eq, &mut i),
340            b'<' => {
341                if peek_eq(bytes, i + 1, b'=') && peek_eq(bytes, i + 2, b'>') {
342                    out.push(Token::CosineDistance);
343                    i += 3;
344                } else if peek_eq(bytes, i + 1, b'#') && peek_eq(bytes, i + 2, b'>') {
345                    out.push(Token::InnerProduct);
346                    i += 3;
347                } else if peek_eq(bytes, i + 1, b'-') && peek_eq(bytes, i + 2, b'>') {
348                    out.push(Token::L2Distance);
349                    i += 3;
350                } else if peek_eq(bytes, i + 1, b'=') {
351                    out.push(Token::LtEq);
352                    i += 2;
353                } else if peek_eq(bytes, i + 1, b'>') {
354                    out.push(Token::NotEq);
355                    i += 2;
356                } else {
357                    out.push(Token::Lt);
358                    i += 1;
359                }
360            }
361            b':' if peek_eq(bytes, i + 1, b':') => {
362                out.push(Token::DoubleColon);
363                i += 2;
364            }
365            b':' if peek_eq(bytes, i + 1, b'=') => {
366                // v7.12.4 — PL/pgSQL assignment operator `:=`.
367                out.push(Token::ColonEq);
368                i += 2;
369            }
370            b':' => {
371                // v7.12.4 — bare `:`. Used inside `tsvector` external-form
372                // literals which the cast parser consumes in-token, and as a
373                // separator the PL/pgSQL assignment lexer can recover from.
374                out.push(Token::Colon);
375                i += 1;
376            }
377            b'|' if peek_eq(bytes, i + 1, b'|') => {
378                out.push(Token::Concat);
379                i += 2;
380            }
381            b'>' => {
382                if peek_eq(bytes, i + 1, b'=') {
383                    out.push(Token::GtEq);
384                    i += 2;
385                } else {
386                    out.push(Token::Gt);
387                    i += 1;
388                }
389            }
390            b'!' if peek_eq(bytes, i + 1, b'=') => {
391                out.push(Token::NotEq);
392                i += 2;
393            }
394            // v7.9.27 — PG dollar-quoted string `$$ … $$` (or
395            // `$tag$ … $tag$`). Used in `DO $$ … $$ LANGUAGE
396            // plpgsql;` blocks that pg_dump emits for idempotent
397            // migrations. SPG has no PL/pgSQL, so the lexer
398            // consumes the entire string as a single Token::String
399            // and the parser treats the surrounding `DO …;` as a
400            // no-op. mailrs follow-up H1.
401            b'$' if i + 1 < bytes.len() && bytes[i + 1] == b'$' => {
402                // Empty tag form: `$$ … $$`.
403                let end = find_dollar_tag_end(bytes, i + 2, b"$$");
404                let body = match end {
405                    Some(e) => &input[i + 2..e],
406                    None => {
407                        return Err(LexError {
408                            kind: LexErrorKind::UnterminatedString,
409                            pos: i,
410                        });
411                    }
412                };
413                out.push(Token::String(body.to_string()));
414                i = end.unwrap() + 2;
415            }
416            b'$' if i + 1 < bytes.len()
417                && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_') =>
418            {
419                // Tagged form: `$foo$ … $foo$`. Scan the tag
420                // ident, find the closing copy.
421                let mut j = i + 1;
422                while j < bytes.len() && (bytes[j].is_ascii_alphanumeric() || bytes[j] == b'_') {
423                    j += 1;
424                }
425                if j >= bytes.len() || bytes[j] != b'$' {
426                    // Not a dollar-quoted string — fall through
427                    // to the generic-unknown-char path.
428                    let ch = input[i..].chars().next().unwrap_or('?');
429                    return Err(LexError {
430                        kind: LexErrorKind::UnknownChar(ch),
431                        pos: i,
432                    });
433                }
434                let close: alloc::vec::Vec<u8> = bytes[i..=j].to_vec();
435                let end = find_dollar_tag_end(bytes, j + 1, &close);
436                let body = match end {
437                    Some(e) => &input[j + 1..e],
438                    None => {
439                        return Err(LexError {
440                            kind: LexErrorKind::UnterminatedString,
441                            pos: i,
442                        });
443                    }
444                };
445                out.push(Token::String(body.to_string()));
446                i = end.unwrap() + close.len();
447            }
448            // v6.1.1: `$N` parameter placeholder for the extended
449            // query protocol. PG numbers them 1..=N; we reject $0
450            // and a bare `$` not followed by a digit.
451            b'$' if i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() => {
452                let mut j = i + 1;
453                let mut n: u32 = 0;
454                while j < bytes.len() && bytes[j].is_ascii_digit() {
455                    n = n
456                        .saturating_mul(10)
457                        .saturating_add(u32::from(bytes[j] - b'0'));
458                    j += 1;
459                }
460                if n == 0 || n > u32::from(u16::MAX) {
461                    return Err(LexError {
462                        kind: LexErrorKind::BadNumber(input[i..j].to_string()),
463                        pos: i,
464                    });
465                }
466                #[allow(clippy::cast_possible_truncation)]
467                out.push(Token::Placeholder(n as u16));
468                i = j;
469            }
470            _ => {
471                let ch = input[i..].chars().next().unwrap_or('?');
472                return Err(LexError {
473                    kind: LexErrorKind::UnknownChar(ch),
474                    pos: i,
475                });
476            }
477        }
478    }
479    out.push(Token::Eof);
480    Ok(out)
481}
482
/// True when `bytes[i]` exists and equals `target`; out-of-range indices
/// simply yield `false`.
fn peek_eq(bytes: &[u8], i: usize, target: u8) -> bool {
    matches!(bytes.get(i), Some(&found) if found == target)
}
486
/// v7.9.27 — locate the next occurrence of `tag` (e.g. `b"$$"` or
/// `b"$foo$"`) in `bytes`, searching from index `from`.
///
/// Returns the absolute start index of the match, or `None` when `tag` is
/// empty, `from` lies beyond the end of `bytes`, or no match exists.
fn find_dollar_tag_end(bytes: &[u8], from: usize, tag: &[u8]) -> Option<usize> {
    // `windows(0)` would panic, so reject the empty tag up front.
    if tag.is_empty() {
        return None;
    }
    let haystack = bytes.get(from..)?;
    haystack
        .windows(tag.len())
        .position(|window| window == tag)
        .map(|offset| from + offset)
}
502
/// True when `bytes[i]` exists and satisfies `pred`; out-of-range indices
/// yield `false` without calling `pred`.
fn peek_pred<F: Fn(&u8) -> bool>(bytes: &[u8], i: usize, pred: F) -> bool {
    match bytes.get(i) {
        Some(byte) => pred(byte),
        None => false,
    }
}
506
507fn single(out: &mut Vec<Token>, tok: Token, i: &mut usize) {
508    out.push(tok);
509    *i += 1;
510}
511
512/// Length-first ASCII-CI keyword lookup. Avoids allocating a
513/// lowercase `String` when the input matches a keyword; only the ident
514/// fall-through path pays for the lowercase copy.
515///
516/// Grouped by length so the outer `match` becomes a small jump table.
517/// Within a length bucket every keyword has either a unique first
518/// byte (cheap dispatch) or a small set of disambiguating
519/// trailing-byte comparisons. All comparisons are ASCII-CI (XOR
520/// 0x20 on each byte before the compare).
521fn keyword_or_ident_raw(raw: &str) -> Token {
522    let b = raw.as_bytes();
523    let tok = match b.len() {
524        2 => kw_len2(b),
525        3 => kw_len3(b),
526        4 => kw_len4(b),
527        5 => kw_len5(b),
528        6 => kw_len6(b),
529        7 => kw_len7(b),
530        8 => kw_len8(b),
531        9 => kw_len9(b),
532        10 => kw_len10(b),
533        11 => kw_len11(b),
534        12 => kw_len12(b),
535        _ => None,
536    };
537    match tok {
538        Some(t) => t,
539        // Ident fall-through: this is the only path that allocates.
540        None => Token::Ident(raw.to_ascii_lowercase()),
541    }
542}
543
/// ASCII case-insensitive equality of `input` against `lower`.
///
/// `lower` must already be lower-case: each byte of `input` is folded with
/// `to_ascii_lowercase` and compared to the corresponding byte of `lower`
/// as-is, so an upper-case byte in `lower` can never match. Slices of
/// different lengths are never equal.
#[inline]
fn eq_ci(input: &[u8], lower: &[u8]) -> bool {
    input.len() == lower.len()
        && input
            .iter()
            .zip(lower)
            .all(|(byte, want)| byte.to_ascii_lowercase() == *want)
}
561
562#[inline]
563fn kw_len2(b: &[u8]) -> Option<Token> {
564    // 7 keywords: as, by, in, is, on, or, to
565    if eq_ci(b, b"as") {
566        return Some(Token::As);
567    }
568    if eq_ci(b, b"by") {
569        return Some(Token::By);
570    }
571    if eq_ci(b, b"in") {
572        return Some(Token::In);
573    }
574    if eq_ci(b, b"is") {
575        return Some(Token::Is);
576    }
577    if eq_ci(b, b"on") {
578        return Some(Token::On);
579    }
580    if eq_ci(b, b"or") {
581        return Some(Token::Or);
582    }
583    if eq_ci(b, b"to") {
584        return Some(Token::To);
585    }
586    None
587}
588
589#[inline]
590fn kw_len3(b: &[u8]) -> Option<Token> {
591    // 5 keywords: all, and, asc, not, for
592    if eq_ci(b, b"for") {
593        return Some(Token::For);
594    }
595    if eq_ci(b, b"all") {
596        return Some(Token::All);
597    }
598    if eq_ci(b, b"and") {
599        return Some(Token::And);
600    }
601    if eq_ci(b, b"asc") {
602        return Some(Token::Asc);
603    }
604    if eq_ci(b, b"not") {
605        return Some(Token::Not);
606    }
607    None
608}
609
610#[inline]
611fn kw_len4(b: &[u8]) -> Option<Token> {
612    // 10 keywords: from, null, true, into, like, join, left, show, desc, drop
613    if eq_ci(b, b"from") {
614        return Some(Token::From);
615    }
616    if eq_ci(b, b"drop") {
617        return Some(Token::Drop);
618    }
619    if eq_ci(b, b"null") {
620        return Some(Token::Null);
621    }
622    if eq_ci(b, b"true") {
623        return Some(Token::True);
624    }
625    if eq_ci(b, b"into") {
626        return Some(Token::Into);
627    }
628    if eq_ci(b, b"like") {
629        return Some(Token::Like);
630    }
631    if eq_ci(b, b"join") {
632        return Some(Token::Join);
633    }
634    if eq_ci(b, b"left") {
635        return Some(Token::Left);
636    }
637    if eq_ci(b, b"show") {
638        return Some(Token::Show);
639    }
640    if eq_ci(b, b"desc") {
641        return Some(Token::Desc);
642    }
643    None
644}
645
646#[inline]
647fn kw_len5(b: &[u8]) -> Option<Token> {
648    // 12 keywords: false, where, table, index, begin, order, limit,
649    // group, union, inner, cross, outer
650    if eq_ci(b, b"false") {
651        return Some(Token::False);
652    }
653    if eq_ci(b, b"where") {
654        return Some(Token::Where);
655    }
656    if eq_ci(b, b"table") {
657        return Some(Token::Table);
658    }
659    if eq_ci(b, b"index") {
660        return Some(Token::Index);
661    }
662    if eq_ci(b, b"begin") {
663        return Some(Token::Begin);
664    }
665    if eq_ci(b, b"order") {
666        return Some(Token::Order);
667    }
668    if eq_ci(b, b"limit") {
669        return Some(Token::Limit);
670    }
671    if eq_ci(b, b"group") {
672        return Some(Token::Group);
673    }
674    if eq_ci(b, b"union") {
675        return Some(Token::Union);
676    }
677    if eq_ci(b, b"inner") {
678        return Some(Token::Inner);
679    }
680    if eq_ci(b, b"cross") {
681        return Some(Token::Cross);
682    }
683    if eq_ci(b, b"outer") {
684        return Some(Token::Outer);
685    }
686    None
687}
688
689#[inline]
690fn kw_len6(b: &[u8]) -> Option<Token> {
691    // 9 keywords: select, create, insert, values, commit, having, offset, tables, except
692    if eq_ci(b, b"select") {
693        return Some(Token::Select);
694    }
695    if eq_ci(b, b"tables") {
696        return Some(Token::Tables);
697    }
698    if eq_ci(b, b"except") {
699        return Some(Token::Except);
700    }
701    if eq_ci(b, b"create") {
702        return Some(Token::Create);
703    }
704    if eq_ci(b, b"insert") {
705        return Some(Token::Insert);
706    }
707    if eq_ci(b, b"values") {
708        return Some(Token::Values);
709    }
710    if eq_ci(b, b"commit") {
711        return Some(Token::Commit);
712    }
713    if eq_ci(b, b"having") {
714        return Some(Token::Having);
715    }
716    if eq_ci(b, b"offset") {
717        return Some(Token::Offset);
718    }
719    None
720}
721
722#[inline]
723fn kw_len7(b: &[u8]) -> Option<Token> {
724    // 4 keywords: between, default, release, extract
725    if eq_ci(b, b"between") {
726        return Some(Token::Between);
727    }
728    if eq_ci(b, b"default") {
729        return Some(Token::Default);
730    }
731    if eq_ci(b, b"release") {
732        return Some(Token::Release);
733    }
734    if eq_ci(b, b"extract") {
735        return Some(Token::Extract);
736    }
737    None
738}
739
740#[inline]
741fn kw_len8(b: &[u8]) -> Option<Token> {
742    // 3 keywords: rollback, distinct, interval
743    if eq_ci(b, b"rollback") {
744        return Some(Token::Rollback);
745    }
746    if eq_ci(b, b"distinct") {
747        return Some(Token::Distinct);
748    }
749    if eq_ci(b, b"interval") {
750        return Some(Token::Interval);
751    }
752    None
753}
754
755#[inline]
756fn kw_len9(b: &[u8]) -> Option<Token> {
757    // 1 keyword: savepoint
758    if eq_ci(b, b"savepoint") {
759        return Some(Token::Savepoint);
760    }
761    None
762}
763
764#[inline]
765fn kw_len10(b: &[u8]) -> Option<Token> {
766    // 1 keyword: connection
767    if eq_ci(b, b"connection") {
768        return Some(Token::Connection);
769    }
770    None
771}
772
773#[inline]
774fn kw_len11(b: &[u8]) -> Option<Token> {
775    // 1 keyword: publication
776    if eq_ci(b, b"publication") {
777        return Some(Token::Publication);
778    }
779    None
780}
781
782#[inline]
783fn kw_len12(b: &[u8]) -> Option<Token> {
784    // 1 keyword: subscription
785    if eq_ci(b, b"subscription") {
786        return Some(Token::Subscription);
787    }
788    None
789}
790
791/// Lex a `'...'` string literal or `"..."` quoted identifier. The opening
792/// quote sits at `input[start]`; `quote` is its byte value. `is_ident` selects
793/// the resulting token shape.
794///
795/// PG-style doubling escapes the quote: `''` inside `'...'` is a literal `'`,
796/// same for `""` inside `"..."`.
797fn lex_quoted(
798    input: &str,
799    start: usize,
800    quote: u8,
801    is_ident: bool,
802) -> Result<(Token, usize), LexError> {
803    let bytes = input.as_bytes();
804    let mut i = start + 1;
805    let mut s = String::new();
806    loop {
807        if i >= bytes.len() {
808            return Err(LexError {
809                kind: if is_ident {
810                    LexErrorKind::UnterminatedQuotedIdent
811                } else {
812                    LexErrorKind::UnterminatedString
813                },
814                pos: start,
815            });
816        }
817        if bytes[i] == quote {
818            if peek_eq(bytes, i + 1, quote) {
819                s.push(quote as char);
820                i += 2;
821            } else {
822                i += 1;
823                break;
824            }
825        } else {
826            let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
827            s.push(ch);
828            i += ch.len_utf8();
829        }
830    }
831    let tok = if is_ident {
832        Token::QuotedIdent(s)
833    } else {
834        Token::String(s)
835    };
836    Ok((tok, i - start))
837}
838
839fn lex_number(s: &str) -> Result<(Token, usize), LexErrorKind> {
840    let bytes = s.as_bytes();
841    let mut i = 0usize;
842    let mut is_float = false;
843
844    while i < bytes.len() && bytes[i].is_ascii_digit() {
845        i += 1;
846    }
847    if i < bytes.len() && bytes[i] == b'.' {
848        is_float = true;
849        i += 1;
850        while i < bytes.len() && bytes[i].is_ascii_digit() {
851            i += 1;
852        }
853    }
854    if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
855        is_float = true;
856        i += 1;
857        if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
858            i += 1;
859        }
860        let exp_start = i;
861        while i < bytes.len() && bytes[i].is_ascii_digit() {
862            i += 1;
863        }
864        if exp_start == i {
865            return Err(LexErrorKind::BadNumber(s[..i].to_string()));
866        }
867    }
868
869    let lit = &s[..i];
870    if is_float {
871        lit.parse::<f64>()
872            .map(|v| (Token::Float(v), i))
873            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
874    } else {
875        lit.parse::<i64>()
876            .map(|v| (Token::Integer(v), i))
877            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
878    }
879}
880
881#[cfg(test)]
882mod tests {
883    use super::*;
884    use alloc::vec;
885
886    fn lex(s: &str) -> Vec<Token> {
887        tokenize(s).expect("lex ok")
888    }
889
890    #[test]
891    fn empty_yields_only_eof() {
892        assert_eq!(lex(""), vec![Token::Eof]);
893    }
894
895    #[test]
896    fn whitespace_only_yields_only_eof() {
897        assert_eq!(lex("   \t\n  "), vec![Token::Eof]);
898    }
899
900    #[test]
901    fn keywords_are_case_insensitive() {
902        assert_eq!(
903            lex("SELECT select Select"),
904            vec![Token::Select, Token::Select, Token::Select, Token::Eof]
905        );
906    }
907
908    #[test]
909    fn identifiers_lowercase_ascii() {
910        assert_eq!(
911            lex("hello WORLD _x x1"),
912            vec![
913                Token::Ident("hello".into()),
914                Token::Ident("world".into()),
915                Token::Ident("_x".into()),
916                Token::Ident("x1".into()),
917                Token::Eof,
918            ]
919        );
920    }
921
922    #[test]
923    fn quoted_identifier_keeps_case_and_handles_embedded_quote() {
924        assert_eq!(
925            lex(r#""User Name" "a""b""#),
926            vec![
927                Token::QuotedIdent("User Name".into()),
928                Token::QuotedIdent("a\"b".into()),
929                Token::Eof,
930            ]
931        );
932    }
933
934    #[test]
935    fn integer_and_float_literals() {
936        assert_eq!(
937            lex("0 42 1.5 .5 1e10 2.5e-3"),
938            vec![
939                Token::Integer(0),
940                Token::Integer(42),
941                Token::Float(1.5),
942                Token::Float(0.5),
943                Token::Float(1e10),
944                Token::Float(2.5e-3),
945                Token::Eof,
946            ]
947        );
948    }
949
950    #[test]
951    fn negative_number_is_minus_then_integer() {
952        // PG follows this: unary minus is a separate token, parser folds it.
953        assert_eq!(
954            lex("-42"),
955            vec![Token::Minus, Token::Integer(42), Token::Eof]
956        );
957    }
958
#[test]
fn string_literal_doubled_quote_escape() {
    // Inside a single-quoted literal, `''` stands for one literal `'`.
    let text = |value: &str| Token::String(value.into());

    let expected = vec![text("hello"), text("it's"), Token::Eof];

    assert_eq!(lex("'hello' 'it''s'"), expected);
}
970
#[test]
fn all_comparison_and_arithmetic_operators() {
    // One of each comparison and arithmetic operator, space-separated.
    // `<>` and `!=` are two spellings of the same not-equal token.
    let sql = "= <> != < <= > >= + - * /";

    let comparisons = [
        Token::Eq,
        Token::NotEq,
        Token::NotEq,
        Token::Lt,
        Token::LtEq,
        Token::Gt,
        Token::GtEq,
    ];
    let arithmetic = [Token::Plus, Token::Minus, Token::Star, Token::Slash];

    let mut expected = vec![];
    expected.extend_from_slice(&comparisons);
    expected.extend_from_slice(&arithmetic);
    expected.push(Token::Eof);

    assert_eq!(lex(sql), expected);
}
991
#[test]
fn punctuation() {
    // Parentheses, comma, semicolon and dot each map to a dedicated token.
    let sql = "( ) , ; .";

    let expected = vec![
        Token::LParen,
        Token::RParen,
        Token::Comma,
        Token::Semicolon,
        Token::Dot,
        Token::Eof,
    ];

    assert_eq!(lex(sql), expected);
}
1006
#[test]
fn line_comment_skipped() {
    // Everything from `--` up to the newline is dropped; lexing resumes
    // with the keyword on the following line.
    let sql = "SELECT -- trailing junk\nFROM";
    let expected = vec![Token::Select, Token::From, Token::Eof];
    assert_eq!(lex(sql), expected);
}
1014
#[test]
fn block_comment_skipped() {
    // A `/* ... */` comment between two tokens leaves no trace in the
    // token stream.
    let sql = "SELECT /* skipped */ 1";
    let expected = vec![Token::Select, Token::Integer(1), Token::Eof];
    assert_eq!(lex(sql), expected);
}
1022
#[test]
fn unterminated_string_errors() {
    // A string literal that is opened but never closed must fail, and the
    // reported position is where the literal started (offset 0 here).
    let outcome = tokenize("'oops");
    let err = outcome.unwrap_err();

    assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
    assert_eq!(err.pos, 0);
}
1029
#[test]
fn unterminated_block_comment_errors() {
    // A `/*` with no matching `*/` is a lexing error, not a silently
    // swallowed tail.
    let outcome = tokenize("/* never closed");
    let err = outcome.unwrap_err();

    assert!(matches!(err.kind, LexErrorKind::UnterminatedBlockComment));
}
1035
#[test]
fn unknown_char_errors() {
    // `@` is not part of the accepted grammar; the error carries the
    // offending character.
    let outcome = tokenize("@");
    let err = outcome.unwrap_err();

    assert!(matches!(err.kind, LexErrorKind::UnknownChar('@')));
}
1041
#[test]
fn dot_in_qualified_column() {
    // `table.column` splits into identifier, dot, identifier. The dot is
    // not absorbed into either name.
    let ident = |name: &str| Token::Ident(name.into());

    let expected = vec![ident("t"), Token::Dot, ident("col"), Token::Eof];

    assert_eq!(lex("t.col"), expected);
}
1054
1055    // --- v0.11 brackets + distance op + vector keyword --------------------
1056
#[test]
fn brackets_are_distinct_tokens() {
    // Square brackets get their own opening and closing tokens.
    let tokens = lex("[ ]");
    let expected = vec![Token::LBracket, Token::RBracket, Token::Eof];
    assert_eq!(tokens, expected);
}
1064
#[test]
fn l2_distance_is_three_char_token() {
    // Builds `a <middle…> b <eof>` so each case only has to name the
    // operator tokens expected between the two identifiers.
    let between = |middle: &[Token]| {
        let mut expected = vec![Token::Ident("a".into())];
        expected.extend_from_slice(middle);
        expected.push(Token::Ident("b".into()));
        expected.push(Token::Eof);
        expected
    };

    // The full three characters `<->` form one token.
    assert_eq!(lex("a <-> b"), between(&[Token::L2Distance]));

    // A bare `<-` must not be mistaken for it: it is `<` followed by `-`.
    assert_eq!(lex("a <- b"), between(&[Token::Lt, Token::Minus]));
}
1088
#[test]
fn order_by_limit_are_keywords() {
    // ORDER, BY and LIMIT are keyword tokens, not plain identifiers.
    let tokens = lex("ORDER BY LIMIT");
    let expected = vec![Token::Order, Token::By, Token::Limit, Token::Eof];
    assert_eq!(tokens, expected);
}
1096
1097    // --- v1.2: pgvector distance ops + PG cast --------------------------
1098
#[test]
fn inner_product_operator_3char() {
    // `<#>` lexes as a single inner-product token between its operands.
    let ident = |name: &str| Token::Ident(name.into());

    let expected = vec![ident("a"), Token::InnerProduct, ident("b"), Token::Eof];

    assert_eq!(lex("a <#> b"), expected);
}
1111
#[test]
fn cosine_distance_operator_3char() {
    // Builds `a <middle…> b <eof>` so each case below only spells out the
    // operator tokens it expects between the two identifiers.
    let between = |middle: &[Token]| {
        let mut expected = vec![Token::Ident("a".into())];
        expected.extend_from_slice(middle);
        expected.push(Token::Ident("b".into()));
        expected.push(Token::Eof);
        expected
    };

    // `<=>` must come out as one cosine-distance token.
    assert_eq!(lex("a <=> b"), between(&[Token::CosineDistance]));

    // Every other operator that starts with `<` has to keep lexing
    // correctly now that `<=>` exists (greedy match takes the longest).
    // Previously only `<=` was asserted here even though the comment also
    // named `<>` and `<->`; all three are now checked.
    assert_eq!(lex("a <= b"), between(&[Token::LtEq]));
    assert_eq!(lex("a <> b"), between(&[Token::NotEq]));
    assert_eq!(lex("a <-> b"), between(&[Token::L2Distance]));
}
1135
#[test]
fn double_colon_cast_token() {
    // `::` is a single cast token. The type name after it is lexed like
    // any other unquoted identifier, so it comes back lower-cased.
    let ident = |name: &str| Token::Ident(name.into());

    let expected = vec![ident("x"), Token::DoubleColon, ident("int"), Token::Eof];

    assert_eq!(lex("x::INT"), expected);
}
1148
#[test]
fn lone_single_colon_lexes_as_colon_token() {
    // v7.12.4 — a single `:` is a real token now, because both the
    // PL/pgSQL surface and the tsvector external-form literal need it.
    // Before v7.12.4 it was rejected as an unknown character, which was
    // incidental rather than intended.
    let toks = tokenize(":x").expect("colon now lexes");

    let first = &toks[0];
    assert_eq!(first, &Token::Colon);
}
1158
#[test]
fn colon_eq_lexes_as_assignment() {
    // v7.12.4 — `:=` is the PL/pgSQL assignment operator.
    let toks = tokenize("x := 1").expect("colon-eq lexes");

    // The identifier `x` comes first, so the operator sits at index 1,
    // ahead of the integer literal.
    let operator = &toks[1];
    assert!(matches!(operator, Token::ColonEq));
}
1166}
spg_sql/lexer.rs

spg_sql/
lexer.rs