spg_sql/
lexer.rs

1//! Lexer for the PG-dialect subset that SPG accepts.
2//!
3//! v0.2 token stream is value-only — no source spans yet. Errors do report
4//! the byte offset where the offending construct started. Identifiers are
5//! ASCII case-folded to lower-case (matches PG when un-quoted). Quoted
6//! identifiers (`"..."`) preserve case; `""` is an embedded quote.
7//! String literals (`'...'`) follow PG single-quote convention with `''`
8//! as the embedded quote. The lexer accepts but does not interpret E-strings
9//! or dollar-quoted strings — those land in a later milestone.
10
11use alloc::string::{String, ToString};
12use alloc::vec::Vec;
13use core::fmt;
14
/// A single lexical token produced by `tokenize`.
///
/// Keywords are unit variants; identifiers and literals carry their value.
/// The stream carries no source spans.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    As,
    Null,
    True,
    False,
    And,
    Or,
    Not,
    Create,
    Table,
    Insert,
    Into,
    Values,
    Index,
    On,
    Begin,
    Commit,
    Rollback,
    Order,
    By,
    Limit,

    // Identifiers
    Ident(String),       // ASCII case-folded
    QuotedIdent(String), // original case, "" → "

    // Literals
    Integer(i64),
    Float(f64),
    String(String),

    // Operators
    Plus,
    Minus,
    Star,
    Slash,
    Eq,
    NotEq,
    Lt,
    LtEq,
    Gt,
    GtEq,

    // Punctuation
    LParen,
    RParen,
    LBracket,
    RBracket,
    Comma,
    Semicolon,
    Dot,
    /// v4.14 `->` — JSON object/array element access, returns json.
    JsonGet,
    /// v4.14 `->>` — same access, returns text.
    JsonGetText,
    /// v6.4.5 `#>` — JSON path walk, returns json. Path is the
    /// right-hand TEXT with PG `{a,b,0}` syntax.
    JsonGetPath,
    /// v6.4.5 `#>>` — same walk, returns text.
    JsonGetPathText,
    /// v6.4.5 `@>` — JSON containment. `j @> sub` returns true if
    /// every key/value in `sub` is present in `j` with structural
    /// containment for objects + arrays.
    JsonContains,
    /// pgvector L2 distance operator `<->`. Lexed as one token so the
    /// parser can give it its own precedence rung.
    L2Distance,
    /// pgvector inner-product operator `<#>` (returns negative dot product
    /// so smaller still means more similar — same semantics as pgvector).
    InnerProduct,
    /// pgvector cosine distance operator `<=>`.
    CosineDistance,
    /// PG-style cast `expr::type` — single token because we want it to bind
    /// at postfix precedence.
    DoubleColon,
    /// Standard SQL string concatenation `||`.
    Concat,
    /// `IS` keyword — postfix `IS NULL` / `IS NOT NULL` predicates.
    Is,
    Between,
    In,
    Like,
    Group,
    Distinct,
    Union,
    All,
    Join,
    Inner,
    Left,
    Cross,
    Outer,
    Default,
    Savepoint,
    Release,
    To,
    Having,
    Show,
    Extract,
    Offset,
    Asc,
    Desc,
    /// `INTERVAL` — followed by a string literal carrying the span text
    /// (e.g. `INTERVAL '1 day 2 hours'`).
    Interval,
    /// v6.1.1 — `$N` parameter placeholder for the extended query
    /// protocol. The number N is 1-based per PostgreSQL convention;
    /// `$0` is not valid and the lexer rejects it, as it does any N
    /// that does not fit in a `u16`.
    Placeholder(u16),

    /// v6.1.2 — `DROP` keyword. Used by `DROP PUBLICATION <name>`.
    /// Reserved for future `DROP TABLE` / `DROP INDEX` / `DROP USER`
    /// surface that currently goes through SHOW-shaped admin SQL.
    Drop,
    /// v6.1.2 — `FOR` keyword (publication scope).
    For,
    /// v6.1.2 — `TABLES` plural keyword (`FOR ALL TABLES`,
    /// `FOR ALL TABLES EXCEPT …`). The existing `TABLE` keyword
    /// stays a separate token so `CREATE TABLE`'s single-table
    /// form keeps lexing as today.
    Tables,
    /// v6.1.3 (reserved at v6.1.2 to keep the AST shape stable) —
    /// `EXCEPT` keyword for `FOR ALL TABLES EXCEPT t1, t2`.
    Except,
    /// v6.1.2 — `PUBLICATION` keyword.
    Publication,
    /// v6.1.4 (reserved at v6.1.2) — `SUBSCRIPTION` keyword.
    Subscription,
    /// v6.1.4 — `CONNECTION` keyword (for
    /// `CREATE SUBSCRIPTION … CONNECTION '<conn_str>' …`).
    Connection,

    /// End-of-input marker; always the last token `tokenize` emits.
    Eof,
}
153
/// The category of a lexing failure.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A character that does not start any token.
    UnknownChar(char),
    /// A string literal whose closing delimiter was never found.
    UnterminatedString,
    /// A quoted identifier whose closing quote was never found.
    UnterminatedQuotedIdent,
    /// A `/* … */` comment with no closing `*/`.
    UnterminatedBlockComment,
    /// A malformed numeric literal; carries the offending text.
    BadNumber(String),
}

/// A lexing failure together with where it happened.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// What went wrong.
    pub kind: LexErrorKind,
    /// Byte offset in the input at which the offending construct started.
    pub pos: usize,
}

impl fmt::Display for LexError {
    /// Renders a one-line human-readable message ending in the byte offset.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { kind, pos } = self;
        match kind {
            LexErrorKind::UnknownChar(c) => write!(f, "unknown char {c:?} at byte {}", pos),
            LexErrorKind::UnterminatedString => {
                write!(f, "unterminated string literal at byte {}", pos)
            }
            LexErrorKind::UnterminatedQuotedIdent => {
                write!(f, "unterminated quoted identifier at byte {}", pos)
            }
            LexErrorKind::UnterminatedBlockComment => {
                write!(f, "unterminated /* */ comment at byte {}", pos)
            }
            LexErrorKind::BadNumber(s) => {
                write!(f, "invalid number literal {s:?} at byte {}", pos)
            }
        }
    }
}
188
189/// Tokenize `input` into a `Vec<Token>` ending in `Token::Eof`.
190#[allow(clippy::too_many_lines)] // big match — splitting would obscure the dispatch table
191pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
192    let bytes = input.as_bytes();
193    let mut i = 0usize;
194    let mut out = Vec::new();
195
196    while i < bytes.len() {
197        let b = bytes[i];
198        match b {
199            b' ' | b'\t' | b'\n' | b'\r' => {
200                i += 1;
201            }
202            b'-' if peek_eq(bytes, i + 1, b'-') => {
203                i += 2;
204                while i < bytes.len() && bytes[i] != b'\n' {
205                    i += 1;
206                }
207            }
208            b'/' if peek_eq(bytes, i + 1, b'*') => {
209                let start = i;
210                i += 2;
211                let mut closed = false;
212                while i + 1 < bytes.len() {
213                    if bytes[i] == b'*' && bytes[i + 1] == b'/' {
214                        i += 2;
215                        closed = true;
216                        break;
217                    }
218                    i += 1;
219                }
220                if !closed {
221                    return Err(LexError {
222                        kind: LexErrorKind::UnterminatedBlockComment,
223                        pos: start,
224                    });
225                }
226            }
227            b'\'' => {
228                let (tok, consumed) = lex_quoted(input, i, b'\'', false)?;
229                out.push(tok);
230                i += consumed;
231            }
232            b'"' => {
233                let (tok, consumed) = lex_quoted(input, i, b'"', true)?;
234                out.push(tok);
235                i += consumed;
236            }
237            // MySQL-flavoured backtick-quoted identifier. Same semantics
238            // as the standard `"..."` form, including embedded "``" as
239            // a literal backtick.
240            b'`' => {
241                let (tok, consumed) = lex_quoted(input, i, b'`', true)?;
242                out.push(tok);
243                i += consumed;
244            }
245            b if b.is_ascii_alphabetic() || b == b'_' => {
246                let start = i;
247                i += 1;
248                while i < bytes.len() {
249                    let c = bytes[i];
250                    if c.is_ascii_alphanumeric() || c == b'_' {
251                        i += 1;
252                    } else {
253                        break;
254                    }
255                }
256                let raw = &input[start..i];
257                // v3.0.5: try the keyword table case-insensitively
258                // without allocating; only the ident fall-through
259                // pays for a lowercase String.
260                out.push(keyword_or_ident_raw(raw));
261            }
262            b if b.is_ascii_digit() => {
263                let (tok, consumed) =
264                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
265                out.push(tok);
266                i += consumed;
267            }
268            b'.' if peek_pred(bytes, i + 1, u8::is_ascii_digit) => {
269                let (tok, consumed) =
270                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
271                out.push(tok);
272                i += consumed;
273            }
274            b'+' => single(&mut out, Token::Plus, &mut i),
275            b'-' => {
276                // v4.14: `->>` and `->` for JSON path access. `->>`
277                // must be tried before `->` (longest match).
278                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
279                    out.push(Token::JsonGetText);
280                    i += 3;
281                } else if peek_eq(bytes, i + 1, b'>') {
282                    out.push(Token::JsonGet);
283                    i += 2;
284                } else {
285                    single(&mut out, Token::Minus, &mut i);
286                }
287            }
288            // v6.4.5: `#>>` and `#>` JSON path walk.
289            b'#' => {
290                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
291                    out.push(Token::JsonGetPathText);
292                    i += 3;
293                } else if peek_eq(bytes, i + 1, b'>') {
294                    out.push(Token::JsonGetPath);
295                    i += 2;
296                } else {
297                    return Err(LexError {
298                        kind: LexErrorKind::UnknownChar('#'),
299                        pos: i,
300                    });
301                }
302            }
303            // v6.4.5: `@>` JSON containment.
304            b'@' => {
305                if peek_eq(bytes, i + 1, b'>') {
306                    out.push(Token::JsonContains);
307                    i += 2;
308                } else {
309                    return Err(LexError {
310                        kind: LexErrorKind::UnknownChar('@'),
311                        pos: i,
312                    });
313                }
314            }
315            b'*' => single(&mut out, Token::Star, &mut i),
316            b'/' => single(&mut out, Token::Slash, &mut i),
317            b'(' => single(&mut out, Token::LParen, &mut i),
318            b')' => single(&mut out, Token::RParen, &mut i),
319            b'[' => single(&mut out, Token::LBracket, &mut i),
320            b']' => single(&mut out, Token::RBracket, &mut i),
321            b',' => single(&mut out, Token::Comma, &mut i),
322            b';' => single(&mut out, Token::Semicolon, &mut i),
323            b'.' => single(&mut out, Token::Dot, &mut i),
324            b'=' => single(&mut out, Token::Eq, &mut i),
325            b'<' => {
326                if peek_eq(bytes, i + 1, b'=') && peek_eq(bytes, i + 2, b'>') {
327                    out.push(Token::CosineDistance);
328                    i += 3;
329                } else if peek_eq(bytes, i + 1, b'#') && peek_eq(bytes, i + 2, b'>') {
330                    out.push(Token::InnerProduct);
331                    i += 3;
332                } else if peek_eq(bytes, i + 1, b'-') && peek_eq(bytes, i + 2, b'>') {
333                    out.push(Token::L2Distance);
334                    i += 3;
335                } else if peek_eq(bytes, i + 1, b'=') {
336                    out.push(Token::LtEq);
337                    i += 2;
338                } else if peek_eq(bytes, i + 1, b'>') {
339                    out.push(Token::NotEq);
340                    i += 2;
341                } else {
342                    out.push(Token::Lt);
343                    i += 1;
344                }
345            }
346            b':' if peek_eq(bytes, i + 1, b':') => {
347                out.push(Token::DoubleColon);
348                i += 2;
349            }
350            b'|' if peek_eq(bytes, i + 1, b'|') => {
351                out.push(Token::Concat);
352                i += 2;
353            }
354            b'>' => {
355                if peek_eq(bytes, i + 1, b'=') {
356                    out.push(Token::GtEq);
357                    i += 2;
358                } else {
359                    out.push(Token::Gt);
360                    i += 1;
361                }
362            }
363            b'!' if peek_eq(bytes, i + 1, b'=') => {
364                out.push(Token::NotEq);
365                i += 2;
366            }
367            // v7.9.27 — PG dollar-quoted string `$$ … $$` (or
368            // `$tag$ … $tag$`). Used in `DO $$ … $$ LANGUAGE
369            // plpgsql;` blocks that pg_dump emits for idempotent
370            // migrations. SPG has no PL/pgSQL, so the lexer
371            // consumes the entire string as a single Token::String
372            // and the parser treats the surrounding `DO …;` as a
373            // no-op. mailrs follow-up H1.
374            b'$' if i + 1 < bytes.len() && bytes[i + 1] == b'$' => {
375                // Empty tag form: `$$ … $$`.
376                let end = find_dollar_tag_end(bytes, i + 2, b"$$");
377                let body = match end {
378                    Some(e) => &input[i + 2..e],
379                    None => {
380                        return Err(LexError {
381                            kind: LexErrorKind::UnterminatedString,
382                            pos: i,
383                        });
384                    }
385                };
386                out.push(Token::String(body.to_string()));
387                i = end.unwrap() + 2;
388            }
389            b'$' if i + 1 < bytes.len()
390                && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_') =>
391            {
392                // Tagged form: `$foo$ … $foo$`. Scan the tag
393                // ident, find the closing copy.
394                let mut j = i + 1;
395                while j < bytes.len() && (bytes[j].is_ascii_alphanumeric() || bytes[j] == b'_') {
396                    j += 1;
397                }
398                if j >= bytes.len() || bytes[j] != b'$' {
399                    // Not a dollar-quoted string — fall through
400                    // to the generic-unknown-char path.
401                    let ch = input[i..].chars().next().unwrap_or('?');
402                    return Err(LexError {
403                        kind: LexErrorKind::UnknownChar(ch),
404                        pos: i,
405                    });
406                }
407                let close: alloc::vec::Vec<u8> = bytes[i..=j].to_vec();
408                let end = find_dollar_tag_end(bytes, j + 1, &close);
409                let body = match end {
410                    Some(e) => &input[j + 1..e],
411                    None => {
412                        return Err(LexError {
413                            kind: LexErrorKind::UnterminatedString,
414                            pos: i,
415                        });
416                    }
417                };
418                out.push(Token::String(body.to_string()));
419                i = end.unwrap() + close.len();
420            }
421            // v6.1.1: `$N` parameter placeholder for the extended
422            // query protocol. PG numbers them 1..=N; we reject $0
423            // and a bare `$` not followed by a digit.
424            b'$' if i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() => {
425                let mut j = i + 1;
426                let mut n: u32 = 0;
427                while j < bytes.len() && bytes[j].is_ascii_digit() {
428                    n = n
429                        .saturating_mul(10)
430                        .saturating_add(u32::from(bytes[j] - b'0'));
431                    j += 1;
432                }
433                if n == 0 || n > u32::from(u16::MAX) {
434                    return Err(LexError {
435                        kind: LexErrorKind::BadNumber(input[i..j].to_string()),
436                        pos: i,
437                    });
438                }
439                #[allow(clippy::cast_possible_truncation)]
440                out.push(Token::Placeholder(n as u16));
441                i = j;
442            }
443            _ => {
444                let ch = input[i..].chars().next().unwrap_or('?');
445                return Err(LexError {
446                    kind: LexErrorKind::UnknownChar(ch),
447                    pos: i,
448                });
449            }
450        }
451    }
452    out.push(Token::Eof);
453    Ok(out)
454}
455
/// Returns `true` when `bytes` has a byte at index `i` and it equals
/// `target`; an out-of-range `i` yields `false`.
fn peek_eq(bytes: &[u8], i: usize, target: u8) -> bool {
    bytes.get(i).is_some_and(|&found| found == target)
}
459
/// v7.9.27 — locate the next occurrence of `tag` (e.g. `b"$$"` or
/// `b"$foo$"`) in `bytes`, searching from index `from` onwards.
///
/// Returns the index in `bytes` where the match starts, or `None` when
/// `tag` is empty, `from` lies past the end of `bytes`, or no match exists.
fn find_dollar_tag_end(bytes: &[u8], from: usize, tag: &[u8]) -> Option<usize> {
    // `windows(0)` would panic, so the empty tag is rejected up front.
    if tag.is_empty() {
        return None;
    }
    let haystack = bytes.get(from..)?;
    haystack
        .windows(tag.len())
        .position(|window| window == tag)
        .map(|offset| from + offset)
}
475
/// Returns `true` when `bytes` has a byte at index `i` and `pred` accepts
/// it; an out-of-range `i` yields `false` without calling `pred`.
fn peek_pred<F: Fn(&u8) -> bool>(bytes: &[u8], i: usize, pred: F) -> bool {
    match bytes.get(i) {
        Some(byte) => pred(byte),
        None => false,
    }
}
479
480fn single(out: &mut Vec<Token>, tok: Token, i: &mut usize) {
481    out.push(tok);
482    *i += 1;
483}
484
485/// Length-first ASCII-CI keyword lookup. Avoids allocating a
486/// lowercase `String` when the input matches a keyword; only the ident
487/// fall-through path pays for the lowercase copy.
488///
489/// Grouped by length so the outer `match` becomes a small jump table.
490/// Within a length bucket every keyword has either a unique first
491/// byte (cheap dispatch) or a small set of disambiguating
492/// trailing-byte comparisons. All comparisons are ASCII-CI (XOR
493/// 0x20 on each byte before the compare).
494fn keyword_or_ident_raw(raw: &str) -> Token {
495    let b = raw.as_bytes();
496    let tok = match b.len() {
497        2 => kw_len2(b),
498        3 => kw_len3(b),
499        4 => kw_len4(b),
500        5 => kw_len5(b),
501        6 => kw_len6(b),
502        7 => kw_len7(b),
503        8 => kw_len8(b),
504        9 => kw_len9(b),
505        10 => kw_len10(b),
506        11 => kw_len11(b),
507        12 => kw_len12(b),
508        _ => None,
509    };
510    match tok {
511        Some(t) => t,
512        // Ident fall-through: this is the only path that allocates.
513        None => Token::Ident(raw.to_ascii_lowercase()),
514    }
515}
516
/// ASCII-case-insensitive equality of `input` against a lowercase literal.
///
/// Each byte of `input` is lower-cased with `to_ascii_lowercase` and
/// compared to the byte at the same position in `lower`; `lower` itself is
/// compared as-is, so it must already be lowercase to ever match letters.
/// Slices of different lengths are never equal.
#[inline]
fn eq_ci(input: &[u8], lower: &[u8]) -> bool {
    input.len() == lower.len()
        && input
            .iter()
            .zip(lower)
            .all(|(got, want)| got.to_ascii_lowercase() == *want)
}
534
535#[inline]
536fn kw_len2(b: &[u8]) -> Option<Token> {
537    // 7 keywords: as, by, in, is, on, or, to
538    if eq_ci(b, b"as") {
539        return Some(Token::As);
540    }
541    if eq_ci(b, b"by") {
542        return Some(Token::By);
543    }
544    if eq_ci(b, b"in") {
545        return Some(Token::In);
546    }
547    if eq_ci(b, b"is") {
548        return Some(Token::Is);
549    }
550    if eq_ci(b, b"on") {
551        return Some(Token::On);
552    }
553    if eq_ci(b, b"or") {
554        return Some(Token::Or);
555    }
556    if eq_ci(b, b"to") {
557        return Some(Token::To);
558    }
559    None
560}
561
562#[inline]
563fn kw_len3(b: &[u8]) -> Option<Token> {
564    // 5 keywords: all, and, asc, not, for
565    if eq_ci(b, b"for") {
566        return Some(Token::For);
567    }
568    if eq_ci(b, b"all") {
569        return Some(Token::All);
570    }
571    if eq_ci(b, b"and") {
572        return Some(Token::And);
573    }
574    if eq_ci(b, b"asc") {
575        return Some(Token::Asc);
576    }
577    if eq_ci(b, b"not") {
578        return Some(Token::Not);
579    }
580    None
581}
582
583#[inline]
584fn kw_len4(b: &[u8]) -> Option<Token> {
585    // 10 keywords: from, null, true, into, like, join, left, show, desc, drop
586    if eq_ci(b, b"from") {
587        return Some(Token::From);
588    }
589    if eq_ci(b, b"drop") {
590        return Some(Token::Drop);
591    }
592    if eq_ci(b, b"null") {
593        return Some(Token::Null);
594    }
595    if eq_ci(b, b"true") {
596        return Some(Token::True);
597    }
598    if eq_ci(b, b"into") {
599        return Some(Token::Into);
600    }
601    if eq_ci(b, b"like") {
602        return Some(Token::Like);
603    }
604    if eq_ci(b, b"join") {
605        return Some(Token::Join);
606    }
607    if eq_ci(b, b"left") {
608        return Some(Token::Left);
609    }
610    if eq_ci(b, b"show") {
611        return Some(Token::Show);
612    }
613    if eq_ci(b, b"desc") {
614        return Some(Token::Desc);
615    }
616    None
617}
618
619#[inline]
620fn kw_len5(b: &[u8]) -> Option<Token> {
621    // 12 keywords: false, where, table, index, begin, order, limit,
622    // group, union, inner, cross, outer
623    if eq_ci(b, b"false") {
624        return Some(Token::False);
625    }
626    if eq_ci(b, b"where") {
627        return Some(Token::Where);
628    }
629    if eq_ci(b, b"table") {
630        return Some(Token::Table);
631    }
632    if eq_ci(b, b"index") {
633        return Some(Token::Index);
634    }
635    if eq_ci(b, b"begin") {
636        return Some(Token::Begin);
637    }
638    if eq_ci(b, b"order") {
639        return Some(Token::Order);
640    }
641    if eq_ci(b, b"limit") {
642        return Some(Token::Limit);
643    }
644    if eq_ci(b, b"group") {
645        return Some(Token::Group);
646    }
647    if eq_ci(b, b"union") {
648        return Some(Token::Union);
649    }
650    if eq_ci(b, b"inner") {
651        return Some(Token::Inner);
652    }
653    if eq_ci(b, b"cross") {
654        return Some(Token::Cross);
655    }
656    if eq_ci(b, b"outer") {
657        return Some(Token::Outer);
658    }
659    None
660}
661
662#[inline]
663fn kw_len6(b: &[u8]) -> Option<Token> {
664    // 9 keywords: select, create, insert, values, commit, having, offset, tables, except
665    if eq_ci(b, b"select") {
666        return Some(Token::Select);
667    }
668    if eq_ci(b, b"tables") {
669        return Some(Token::Tables);
670    }
671    if eq_ci(b, b"except") {
672        return Some(Token::Except);
673    }
674    if eq_ci(b, b"create") {
675        return Some(Token::Create);
676    }
677    if eq_ci(b, b"insert") {
678        return Some(Token::Insert);
679    }
680    if eq_ci(b, b"values") {
681        return Some(Token::Values);
682    }
683    if eq_ci(b, b"commit") {
684        return Some(Token::Commit);
685    }
686    if eq_ci(b, b"having") {
687        return Some(Token::Having);
688    }
689    if eq_ci(b, b"offset") {
690        return Some(Token::Offset);
691    }
692    None
693}
694
695#[inline]
696fn kw_len7(b: &[u8]) -> Option<Token> {
697    // 4 keywords: between, default, release, extract
698    if eq_ci(b, b"between") {
699        return Some(Token::Between);
700    }
701    if eq_ci(b, b"default") {
702        return Some(Token::Default);
703    }
704    if eq_ci(b, b"release") {
705        return Some(Token::Release);
706    }
707    if eq_ci(b, b"extract") {
708        return Some(Token::Extract);
709    }
710    None
711}
712
713#[inline]
714fn kw_len8(b: &[u8]) -> Option<Token> {
715    // 3 keywords: rollback, distinct, interval
716    if eq_ci(b, b"rollback") {
717        return Some(Token::Rollback);
718    }
719    if eq_ci(b, b"distinct") {
720        return Some(Token::Distinct);
721    }
722    if eq_ci(b, b"interval") {
723        return Some(Token::Interval);
724    }
725    None
726}
727
728#[inline]
729fn kw_len9(b: &[u8]) -> Option<Token> {
730    // 1 keyword: savepoint
731    if eq_ci(b, b"savepoint") {
732        return Some(Token::Savepoint);
733    }
734    None
735}
736
737#[inline]
738fn kw_len10(b: &[u8]) -> Option<Token> {
739    // 1 keyword: connection
740    if eq_ci(b, b"connection") {
741        return Some(Token::Connection);
742    }
743    None
744}
745
746#[inline]
747fn kw_len11(b: &[u8]) -> Option<Token> {
748    // 1 keyword: publication
749    if eq_ci(b, b"publication") {
750        return Some(Token::Publication);
751    }
752    None
753}
754
755#[inline]
756fn kw_len12(b: &[u8]) -> Option<Token> {
757    // 1 keyword: subscription
758    if eq_ci(b, b"subscription") {
759        return Some(Token::Subscription);
760    }
761    None
762}
763
764/// Lex a `'...'` string literal or `"..."` quoted identifier. The opening
765/// quote sits at `input[start]`; `quote` is its byte value. `is_ident` selects
766/// the resulting token shape.
767///
768/// PG-style doubling escapes the quote: `''` inside `'...'` is a literal `'`,
769/// same for `""` inside `"..."`.
770fn lex_quoted(
771    input: &str,
772    start: usize,
773    quote: u8,
774    is_ident: bool,
775) -> Result<(Token, usize), LexError> {
776    let bytes = input.as_bytes();
777    let mut i = start + 1;
778    let mut s = String::new();
779    loop {
780        if i >= bytes.len() {
781            return Err(LexError {
782                kind: if is_ident {
783                    LexErrorKind::UnterminatedQuotedIdent
784                } else {
785                    LexErrorKind::UnterminatedString
786                },
787                pos: start,
788            });
789        }
790        if bytes[i] == quote {
791            if peek_eq(bytes, i + 1, quote) {
792                s.push(quote as char);
793                i += 2;
794            } else {
795                i += 1;
796                break;
797            }
798        } else {
799            let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
800            s.push(ch);
801            i += ch.len_utf8();
802        }
803    }
804    let tok = if is_ident {
805        Token::QuotedIdent(s)
806    } else {
807        Token::String(s)
808    };
809    Ok((tok, i - start))
810}
811
812fn lex_number(s: &str) -> Result<(Token, usize), LexErrorKind> {
813    let bytes = s.as_bytes();
814    let mut i = 0usize;
815    let mut is_float = false;
816
817    while i < bytes.len() && bytes[i].is_ascii_digit() {
818        i += 1;
819    }
820    if i < bytes.len() && bytes[i] == b'.' {
821        is_float = true;
822        i += 1;
823        while i < bytes.len() && bytes[i].is_ascii_digit() {
824            i += 1;
825        }
826    }
827    if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
828        is_float = true;
829        i += 1;
830        if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
831            i += 1;
832        }
833        let exp_start = i;
834        while i < bytes.len() && bytes[i].is_ascii_digit() {
835            i += 1;
836        }
837        if exp_start == i {
838            return Err(LexErrorKind::BadNumber(s[..i].to_string()));
839        }
840    }
841
842    let lit = &s[..i];
843    if is_float {
844        lit.parse::<f64>()
845            .map(|v| (Token::Float(v), i))
846            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
847    } else {
848        lit.parse::<i64>()
849            .map(|v| (Token::Integer(v), i))
850            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
851    }
852}
853
854#[cfg(test)]
855mod tests {
856    use super::*;
857    use alloc::vec;
858
859    fn lex(s: &str) -> Vec<Token> {
860        tokenize(s).expect("lex ok")
861    }
862
863    #[test]
864    fn empty_yields_only_eof() {
865        assert_eq!(lex(""), vec![Token::Eof]);
866    }
867
868    #[test]
869    fn whitespace_only_yields_only_eof() {
870        assert_eq!(lex("   \t\n  "), vec![Token::Eof]);
871    }
872
873    #[test]
874    fn keywords_are_case_insensitive() {
875        assert_eq!(
876            lex("SELECT select Select"),
877            vec![Token::Select, Token::Select, Token::Select, Token::Eof]
878        );
879    }
880
881    #[test]
882    fn identifiers_lowercase_ascii() {
883        assert_eq!(
884            lex("hello WORLD _x x1"),
885            vec![
886                Token::Ident("hello".into()),
887                Token::Ident("world".into()),
888                Token::Ident("_x".into()),
889                Token::Ident("x1".into()),
890                Token::Eof,
891            ]
892        );
893    }
894
895    #[test]
896    fn quoted_identifier_keeps_case_and_handles_embedded_quote() {
897        assert_eq!(
898            lex(r#""User Name" "a""b""#),
899            vec![
900                Token::QuotedIdent("User Name".into()),
901                Token::QuotedIdent("a\"b".into()),
902                Token::Eof,
903            ]
904        );
905    }
906
907    #[test]
908    fn integer_and_float_literals() {
909        assert_eq!(
910            lex("0 42 1.5 .5 1e10 2.5e-3"),
911            vec![
912                Token::Integer(0),
913                Token::Integer(42),
914                Token::Float(1.5),
915                Token::Float(0.5),
916                Token::Float(1e10),
917                Token::Float(2.5e-3),
918                Token::Eof,
919            ]
920        );
921    }
922
923    #[test]
924    fn negative_number_is_minus_then_integer() {
925        // PG follows this: unary minus is a separate token, parser folds it.
926        assert_eq!(
927            lex("-42"),
928            vec![Token::Minus, Token::Integer(42), Token::Eof]
929        );
930    }
931
932    #[test]
933    fn string_literal_doubled_quote_escape() {
934        assert_eq!(
935            lex("'hello' 'it''s'"),
936            vec![
937                Token::String("hello".into()),
938                Token::String("it's".into()),
939                Token::Eof,
940            ]
941        );
942    }
943
944    #[test]
945    fn all_comparison_and_arithmetic_operators() {
946        assert_eq!(
947            lex("= <> != < <= > >= + - * /"),
948            vec![
949                Token::Eq,
950                Token::NotEq,
951                Token::NotEq,
952                Token::Lt,
953                Token::LtEq,
954                Token::Gt,
955                Token::GtEq,
956                Token::Plus,
957                Token::Minus,
958                Token::Star,
959                Token::Slash,
960                Token::Eof,
961            ]
962        );
963    }
964
965    #[test]
966    fn punctuation() {
967        assert_eq!(
968            lex("( ) , ; ."),
969            vec![
970                Token::LParen,
971                Token::RParen,
972                Token::Comma,
973                Token::Semicolon,
974                Token::Dot,
975                Token::Eof,
976            ]
977        );
978    }
979
980    #[test]
981    fn line_comment_skipped() {
982        assert_eq!(
983            lex("SELECT -- trailing junk\nFROM"),
984            vec![Token::Select, Token::From, Token::Eof]
985        );
986    }
987
988    #[test]
989    fn block_comment_skipped() {
990        assert_eq!(
991            lex("SELECT /* skipped */ 1"),
992            vec![Token::Select, Token::Integer(1), Token::Eof]
993        );
994    }
995
996    #[test]
997    fn unterminated_string_errors() {
998        let err = tokenize("'oops").unwrap_err();
999        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
1000        assert_eq!(err.pos, 0);
1001    }
1002
1003    #[test]
1004    fn unterminated_block_comment_errors() {
1005        let err = tokenize("/* never closed").unwrap_err();
1006        assert!(matches!(err.kind, LexErrorKind::UnterminatedBlockComment));
1007    }
1008
1009    #[test]
1010    fn unknown_char_errors() {
1011        let err = tokenize("@").unwrap_err();
1012        assert!(matches!(err.kind, LexErrorKind::UnknownChar('@')));
1013    }
1014
1015    #[test]
1016    fn dot_in_qualified_column() {
1017        assert_eq!(
1018            lex("t.col"),
1019            vec![
1020                Token::Ident("t".into()),
1021                Token::Dot,
1022                Token::Ident("col".into()),
1023                Token::Eof,
1024            ]
1025        );
1026    }
1027
1028    // --- v0.11 brackets + distance op + vector keyword --------------------
1029
1030    #[test]
1031    fn brackets_are_distinct_tokens() {
1032        assert_eq!(
1033            lex("[ ]"),
1034            vec![Token::LBracket, Token::RBracket, Token::Eof]
1035        );
1036    }
1037
1038    #[test]
1039    fn l2_distance_is_three_char_token() {
1040        assert_eq!(
1041            lex("a <-> b"),
1042            vec![
1043                Token::Ident("a".into()),
1044                Token::L2Distance,
1045                Token::Ident("b".into()),
1046                Token::Eof,
1047            ]
1048        );
1049        // Bare `<-` should NOT match L2Distance.
1050        assert_eq!(
1051            lex("a <- b"),
1052            vec![
1053                Token::Ident("a".into()),
1054                Token::Lt,
1055                Token::Minus,
1056                Token::Ident("b".into()),
1057                Token::Eof,
1058            ]
1059        );
1060    }
1061
1062    #[test]
1063    fn order_by_limit_are_keywords() {
1064        assert_eq!(
1065            lex("ORDER BY LIMIT"),
1066            vec![Token::Order, Token::By, Token::Limit, Token::Eof]
1067        );
1068    }
1069
1070    // --- v1.2: pgvector distance ops + PG cast --------------------------
1071
1072    #[test]
1073    fn inner_product_operator_3char() {
1074        assert_eq!(
1075            lex("a <#> b"),
1076            vec![
1077                Token::Ident("a".into()),
1078                Token::InnerProduct,
1079                Token::Ident("b".into()),
1080                Token::Eof,
1081            ]
1082        );
1083    }
1084
1085    #[test]
1086    fn cosine_distance_operator_3char() {
1087        assert_eq!(
1088            lex("a <=> b"),
1089            vec![
1090                Token::Ident("a".into()),
1091                Token::CosineDistance,
1092                Token::Ident("b".into()),
1093                Token::Eof,
1094            ]
1095        );
1096        // Make sure `<=` and `<>` and `<->` still lex right when `<=>` is
1097        // around (greedy match takes the longest).
1098        assert_eq!(
1099            lex("a <= b"),
1100            vec![
1101                Token::Ident("a".into()),
1102                Token::LtEq,
1103                Token::Ident("b".into()),
1104                Token::Eof,
1105            ]
1106        );
1107    }
1108
1109    #[test]
1110    fn double_colon_cast_token() {
1111        assert_eq!(
1112            lex("x::INT"),
1113            vec![
1114                Token::Ident("x".into()),
1115                Token::DoubleColon,
1116                Token::Ident("int".into()),
1117                Token::Eof,
1118            ]
1119        );
1120    }
1121
1122    #[test]
1123    fn lone_single_colon_is_unknown_char() {
1124        let err = tokenize(":x").unwrap_err();
1125        assert!(matches!(err.kind, LexErrorKind::UnknownChar(':')));
1126    }
1127}
spg_sql/lexer.rs

spg_sql/
lexer.rs