spg_sql/
lexer.rs

//! Lexer for the PG-dialect subset that SPG accepts.
//!
//! v0.2 token stream is value-only — no source spans yet. Errors do report
//! the byte offset where the offending construct started. Identifiers are
//! ASCII case-folded to lower-case (matches PG when un-quoted). Quoted
//! identifiers (`"..."`, or MySQL-style `` `...` ``) preserve case; a doubled
//! quote character is an embedded quote.
//! String literals (`'...'`) follow PG single-quote convention with `''`
//! as the embedded quote. E-strings are not interpreted: `E'...'` lexes as
//! the identifier `e` followed by a plain string literal. Dollar-quoted
//! strings (`$$...$$`) are rejected as an unknown `$` character — both land
//! in a later milestone.
10
11use alloc::string::{String, ToString};
12use alloc::vec::Vec;
13use core::fmt;
14
/// One lexical token produced by [`tokenize`].
///
/// Keyword variants carry no payload; identifiers, literals and `$N`
/// placeholders carry their value. The stream always ends with
/// [`Token::Eof`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    As,
    Null,
    True,
    False,
    And,
    Or,
    Not,
    Create,
    Table,
    Insert,
    Into,
    Values,
    Index,
    On,
    Begin,
    Commit,
    Rollback,
    Order,
    By,
    Limit,

    // Identifiers
    /// Unquoted identifier, ASCII case-folded to lower-case.
    Ident(String),
    /// Quoted identifier (`"..."` or `` `...` ``): original case, with a
    /// doubled quote character collapsed to a single one.
    QuotedIdent(String),

    // Literals
    Integer(i64),
    Float(f64),
    String(String),

    // Operators
    Plus,
    Minus,
    Star,
    Slash,
    Eq,
    NotEq,
    Lt,
    LtEq,
    Gt,
    GtEq,

    // Punctuation
    LParen,
    RParen,
    LBracket,
    RBracket,
    Comma,
    Semicolon,
    Dot,

    // Multi-character operators
    /// v4.14 `->` — JSON object/array element access, returns json.
    JsonGet,
    /// v4.14 `->>` — same access, returns text.
    JsonGetText,
    /// v6.4.5 `#>` — JSON path walk, returns json. Path is the
    /// right-hand TEXT with PG `{a,b,0}` syntax.
    JsonGetPath,
    /// v6.4.5 `#>>` — same walk, returns text.
    JsonGetPathText,
    /// v6.4.5 `@>` — JSON containment. `j @> sub` returns true if
    /// every key/value in `sub` is present in `j` with structural
    /// containment for objects + arrays.
    JsonContains,
    /// pgvector L2 distance operator `<->`. Lexed as one token so the
    /// parser can give it its own precedence rung.
    L2Distance,
    /// pgvector inner-product operator `<#>` (returns negative dot product
    /// so smaller still means more similar — same semantics as pgvector).
    InnerProduct,
    /// pgvector cosine distance operator `<=>`.
    CosineDistance,
    /// PG-style cast `expr::type` — single token because we want it to bind
    /// at postfix precedence.
    DoubleColon,
    /// Standard SQL string concatenation `||`.
    Concat,

    // Keywords added after the initial set
    /// `IS` keyword — postfix `IS NULL` / `IS NOT NULL` predicates.
    Is,
    Between,
    In,
    Like,
    Group,
    Distinct,
    Union,
    All,
    Join,
    Inner,
    Left,
    Cross,
    Outer,
    Default,
    Savepoint,
    Release,
    To,
    Having,
    Show,
    Extract,
    Offset,
    Asc,
    Desc,
    /// `INTERVAL` — followed by a string literal carrying the span text
    /// (e.g. `INTERVAL '1 day 2 hours'`).
    Interval,
    /// v6.1.1 — `$N` parameter placeholder for the extended query
    /// protocol. The number N is 1-based per PostgreSQL convention;
    /// `$0` is not valid and the lexer rejects it.
    Placeholder(u16),

    /// v6.1.2 — `DROP` keyword. Used by `DROP PUBLICATION <name>`.
    /// Reserved for future `DROP TABLE` / `DROP INDEX` / `DROP USER`
    /// surface that currently goes through SHOW-shaped admin SQL.
    Drop,
    /// v6.1.2 — `FOR` keyword (publication scope).
    For,
    /// v6.1.2 — `TABLES` plural keyword (`FOR ALL TABLES`,
    /// `FOR ALL TABLES EXCEPT …`). The existing `TABLE` keyword
    /// stays a separate token so `CREATE TABLE`'s single-table
    /// form keeps lexing as today.
    Tables,
    /// v6.1.3 (reserved at v6.1.2 to keep the AST shape stable) —
    /// `EXCEPT` keyword for `FOR ALL TABLES EXCEPT t1, t2`.
    Except,
    /// v6.1.2 — `PUBLICATION` keyword.
    Publication,
    /// v6.1.4 (reserved at v6.1.2) — `SUBSCRIPTION` keyword.
    Subscription,
    /// v6.1.4 — `CONNECTION` keyword (for
    /// `CREATE SUBSCRIPTION … CONNECTION '<conn_str>' …`).
    Connection,

    /// End of input; always the last token in the stream.
    Eof,
}
153
/// The reason a [`LexError`] was raised.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A character that starts no token in the accepted dialect.
    UnknownChar(char),
    /// A `'...'` literal whose closing quote was never found.
    UnterminatedString,
    /// A quoted identifier whose closing quote was never found.
    UnterminatedQuotedIdent,
    /// A `/* ...` comment that runs to the end of the input.
    UnterminatedBlockComment,
    /// A malformed numeric literal or `$N` placeholder; carries the
    /// offending source text.
    BadNumber(String),
}
162
163#[derive(Debug, Clone, PartialEq, Eq)]
164pub struct LexError {
165    pub kind: LexErrorKind,
166    pub pos: usize,
167}
168
169impl fmt::Display for LexError {
170    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
171        match &self.kind {
172            LexErrorKind::UnknownChar(c) => write!(f, "unknown char {c:?} at byte {}", self.pos),
173            LexErrorKind::UnterminatedString => {
174                write!(f, "unterminated string literal at byte {}", self.pos)
175            }
176            LexErrorKind::UnterminatedQuotedIdent => {
177                write!(f, "unterminated quoted identifier at byte {}", self.pos)
178            }
179            LexErrorKind::UnterminatedBlockComment => {
180                write!(f, "unterminated /* */ comment at byte {}", self.pos)
181            }
182            LexErrorKind::BadNumber(s) => {
183                write!(f, "invalid number literal {s:?} at byte {}", self.pos)
184            }
185        }
186    }
187}
188
189/// Tokenize `input` into a `Vec<Token>` ending in `Token::Eof`.
190#[allow(clippy::too_many_lines)] // big match — splitting would obscure the dispatch table
191pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
192    let bytes = input.as_bytes();
193    let mut i = 0usize;
194    let mut out = Vec::new();
195
196    while i < bytes.len() {
197        let b = bytes[i];
198        match b {
199            b' ' | b'\t' | b'\n' | b'\r' => {
200                i += 1;
201            }
202            b'-' if peek_eq(bytes, i + 1, b'-') => {
203                i += 2;
204                while i < bytes.len() && bytes[i] != b'\n' {
205                    i += 1;
206                }
207            }
208            b'/' if peek_eq(bytes, i + 1, b'*') => {
209                let start = i;
210                i += 2;
211                let mut closed = false;
212                while i + 1 < bytes.len() {
213                    if bytes[i] == b'*' && bytes[i + 1] == b'/' {
214                        i += 2;
215                        closed = true;
216                        break;
217                    }
218                    i += 1;
219                }
220                if !closed {
221                    return Err(LexError {
222                        kind: LexErrorKind::UnterminatedBlockComment,
223                        pos: start,
224                    });
225                }
226            }
227            b'\'' => {
228                let (tok, consumed) = lex_quoted(input, i, b'\'', false)?;
229                out.push(tok);
230                i += consumed;
231            }
232            b'"' => {
233                let (tok, consumed) = lex_quoted(input, i, b'"', true)?;
234                out.push(tok);
235                i += consumed;
236            }
237            // MySQL-flavoured backtick-quoted identifier. Same semantics
238            // as the standard `"..."` form, including embedded "``" as
239            // a literal backtick.
240            b'`' => {
241                let (tok, consumed) = lex_quoted(input, i, b'`', true)?;
242                out.push(tok);
243                i += consumed;
244            }
245            b if b.is_ascii_alphabetic() || b == b'_' => {
246                let start = i;
247                i += 1;
248                while i < bytes.len() {
249                    let c = bytes[i];
250                    if c.is_ascii_alphanumeric() || c == b'_' {
251                        i += 1;
252                    } else {
253                        break;
254                    }
255                }
256                let raw = &input[start..i];
257                // v3.0.5: try the keyword table case-insensitively
258                // without allocating; only the ident fall-through
259                // pays for a lowercase String.
260                out.push(keyword_or_ident_raw(raw));
261            }
262            b if b.is_ascii_digit() => {
263                let (tok, consumed) =
264                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
265                out.push(tok);
266                i += consumed;
267            }
268            b'.' if peek_pred(bytes, i + 1, u8::is_ascii_digit) => {
269                let (tok, consumed) =
270                    lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
271                out.push(tok);
272                i += consumed;
273            }
274            b'+' => single(&mut out, Token::Plus, &mut i),
275            b'-' => {
276                // v4.14: `->>` and `->` for JSON path access. `->>`
277                // must be tried before `->` (longest match).
278                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
279                    out.push(Token::JsonGetText);
280                    i += 3;
281                } else if peek_eq(bytes, i + 1, b'>') {
282                    out.push(Token::JsonGet);
283                    i += 2;
284                } else {
285                    single(&mut out, Token::Minus, &mut i);
286                }
287            }
288            // v6.4.5: `#>>` and `#>` JSON path walk.
289            b'#' => {
290                if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
291                    out.push(Token::JsonGetPathText);
292                    i += 3;
293                } else if peek_eq(bytes, i + 1, b'>') {
294                    out.push(Token::JsonGetPath);
295                    i += 2;
296                } else {
297                    return Err(LexError {
298                        kind: LexErrorKind::UnknownChar('#'),
299                        pos: i,
300                    });
301                }
302            }
303            // v6.4.5: `@>` JSON containment.
304            b'@' => {
305                if peek_eq(bytes, i + 1, b'>') {
306                    out.push(Token::JsonContains);
307                    i += 2;
308                } else {
309                    return Err(LexError {
310                        kind: LexErrorKind::UnknownChar('@'),
311                        pos: i,
312                    });
313                }
314            }
315            b'*' => single(&mut out, Token::Star, &mut i),
316            b'/' => single(&mut out, Token::Slash, &mut i),
317            b'(' => single(&mut out, Token::LParen, &mut i),
318            b')' => single(&mut out, Token::RParen, &mut i),
319            b'[' => single(&mut out, Token::LBracket, &mut i),
320            b']' => single(&mut out, Token::RBracket, &mut i),
321            b',' => single(&mut out, Token::Comma, &mut i),
322            b';' => single(&mut out, Token::Semicolon, &mut i),
323            b'.' => single(&mut out, Token::Dot, &mut i),
324            b'=' => single(&mut out, Token::Eq, &mut i),
325            b'<' => {
326                if peek_eq(bytes, i + 1, b'=') && peek_eq(bytes, i + 2, b'>') {
327                    out.push(Token::CosineDistance);
328                    i += 3;
329                } else if peek_eq(bytes, i + 1, b'#') && peek_eq(bytes, i + 2, b'>') {
330                    out.push(Token::InnerProduct);
331                    i += 3;
332                } else if peek_eq(bytes, i + 1, b'-') && peek_eq(bytes, i + 2, b'>') {
333                    out.push(Token::L2Distance);
334                    i += 3;
335                } else if peek_eq(bytes, i + 1, b'=') {
336                    out.push(Token::LtEq);
337                    i += 2;
338                } else if peek_eq(bytes, i + 1, b'>') {
339                    out.push(Token::NotEq);
340                    i += 2;
341                } else {
342                    out.push(Token::Lt);
343                    i += 1;
344                }
345            }
346            b':' if peek_eq(bytes, i + 1, b':') => {
347                out.push(Token::DoubleColon);
348                i += 2;
349            }
350            b'|' if peek_eq(bytes, i + 1, b'|') => {
351                out.push(Token::Concat);
352                i += 2;
353            }
354            b'>' => {
355                if peek_eq(bytes, i + 1, b'=') {
356                    out.push(Token::GtEq);
357                    i += 2;
358                } else {
359                    out.push(Token::Gt);
360                    i += 1;
361                }
362            }
363            b'!' if peek_eq(bytes, i + 1, b'=') => {
364                out.push(Token::NotEq);
365                i += 2;
366            }
367            // v6.1.1: `$N` parameter placeholder for the extended
368            // query protocol. PG numbers them 1..=N; we reject $0
369            // and a bare `$` not followed by a digit. Dollar-quoted
370            // strings ($$ ... $$) are not supported here — they're
371            // a separate lexer feature filed for a future release.
372            b'$' if i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() => {
373                let mut j = i + 1;
374                let mut n: u32 = 0;
375                while j < bytes.len() && bytes[j].is_ascii_digit() {
376                    n = n.saturating_mul(10).saturating_add(u32::from(bytes[j] - b'0'));
377                    j += 1;
378                }
379                if n == 0 || n > u32::from(u16::MAX) {
380                    return Err(LexError {
381                        kind: LexErrorKind::BadNumber(input[i..j].to_string()),
382                        pos: i,
383                    });
384                }
385                #[allow(clippy::cast_possible_truncation)]
386                out.push(Token::Placeholder(n as u16));
387                i = j;
388            }
389            _ => {
390                let ch = input[i..].chars().next().unwrap_or('?');
391                return Err(LexError {
392                    kind: LexErrorKind::UnknownChar(ch),
393                    pos: i,
394                });
395            }
396        }
397    }
398    out.push(Token::Eof);
399    Ok(out)
400}
401
/// True when `bytes[i]` exists and equals `target`; false past the end.
fn peek_eq(bytes: &[u8], i: usize, target: u8) -> bool {
    match bytes.get(i) {
        Some(&found) => found == target,
        None => false,
    }
}
405
/// True when `bytes[i]` exists and satisfies `pred`; false past the end
/// (in which case `pred` is never called).
fn peek_pred<F: Fn(&u8) -> bool>(bytes: &[u8], i: usize, pred: F) -> bool {
    match bytes.get(i) {
        Some(byte) => pred(byte),
        None => false,
    }
}
409
410fn single(out: &mut Vec<Token>, tok: Token, i: &mut usize) {
411    out.push(tok);
412    *i += 1;
413}
414
415/// Length-first ASCII-CI keyword lookup. Avoids allocating a
416/// lowercase `String` when the input matches a keyword; only the ident
417/// fall-through path pays for the lowercase copy.
418///
419/// Grouped by length so the outer `match` becomes a small jump table.
420/// Within a length bucket every keyword has either a unique first
421/// byte (cheap dispatch) or a small set of disambiguating
422/// trailing-byte comparisons. All comparisons are ASCII-CI (XOR
423/// 0x20 on each byte before the compare).
424fn keyword_or_ident_raw(raw: &str) -> Token {
425    let b = raw.as_bytes();
426    let tok = match b.len() {
427        2 => kw_len2(b),
428        3 => kw_len3(b),
429        4 => kw_len4(b),
430        5 => kw_len5(b),
431        6 => kw_len6(b),
432        7 => kw_len7(b),
433        8 => kw_len8(b),
434        9 => kw_len9(b),
435        10 => kw_len10(b),
436        11 => kw_len11(b),
437        12 => kw_len12(b),
438        _ => None,
439    };
440    match tok {
441        Some(t) => t,
442        // Ident fall-through: this is the only path that allocates.
443        None => Token::Ident(raw.to_ascii_lowercase()),
444    }
445}
446
/// ASCII case-insensitive equality of `input` against `lower`.
///
/// `lower` is expected to already be lower-case: each byte of `input` is
/// lower-cased and compared with the corresponding byte of `lower` as-is,
/// so an upper-case byte in `lower` never matches. Slices of different
/// length are never equal.
#[inline]
fn eq_ci(input: &[u8], lower: &[u8]) -> bool {
    input.len() == lower.len()
        && input
            .iter()
            .zip(lower)
            .all(|(have, want)| have.to_ascii_lowercase() == *want)
}
464
465#[inline]
466fn kw_len2(b: &[u8]) -> Option<Token> {
467    // 7 keywords: as, by, in, is, on, or, to
468    if eq_ci(b, b"as") {
469        return Some(Token::As);
470    }
471    if eq_ci(b, b"by") {
472        return Some(Token::By);
473    }
474    if eq_ci(b, b"in") {
475        return Some(Token::In);
476    }
477    if eq_ci(b, b"is") {
478        return Some(Token::Is);
479    }
480    if eq_ci(b, b"on") {
481        return Some(Token::On);
482    }
483    if eq_ci(b, b"or") {
484        return Some(Token::Or);
485    }
486    if eq_ci(b, b"to") {
487        return Some(Token::To);
488    }
489    None
490}
491
492#[inline]
493fn kw_len3(b: &[u8]) -> Option<Token> {
494    // 5 keywords: all, and, asc, not, for
495    if eq_ci(b, b"for") {
496        return Some(Token::For);
497    }
498    if eq_ci(b, b"all") {
499        return Some(Token::All);
500    }
501    if eq_ci(b, b"and") {
502        return Some(Token::And);
503    }
504    if eq_ci(b, b"asc") {
505        return Some(Token::Asc);
506    }
507    if eq_ci(b, b"not") {
508        return Some(Token::Not);
509    }
510    None
511}
512
513#[inline]
514fn kw_len4(b: &[u8]) -> Option<Token> {
515    // 10 keywords: from, null, true, into, like, join, left, show, desc, drop
516    if eq_ci(b, b"from") {
517        return Some(Token::From);
518    }
519    if eq_ci(b, b"drop") {
520        return Some(Token::Drop);
521    }
522    if eq_ci(b, b"null") {
523        return Some(Token::Null);
524    }
525    if eq_ci(b, b"true") {
526        return Some(Token::True);
527    }
528    if eq_ci(b, b"into") {
529        return Some(Token::Into);
530    }
531    if eq_ci(b, b"like") {
532        return Some(Token::Like);
533    }
534    if eq_ci(b, b"join") {
535        return Some(Token::Join);
536    }
537    if eq_ci(b, b"left") {
538        return Some(Token::Left);
539    }
540    if eq_ci(b, b"show") {
541        return Some(Token::Show);
542    }
543    if eq_ci(b, b"desc") {
544        return Some(Token::Desc);
545    }
546    None
547}
548
549#[inline]
550fn kw_len5(b: &[u8]) -> Option<Token> {
551    // 12 keywords: false, where, table, index, begin, order, limit,
552    // group, union, inner, cross, outer
553    if eq_ci(b, b"false") {
554        return Some(Token::False);
555    }
556    if eq_ci(b, b"where") {
557        return Some(Token::Where);
558    }
559    if eq_ci(b, b"table") {
560        return Some(Token::Table);
561    }
562    if eq_ci(b, b"index") {
563        return Some(Token::Index);
564    }
565    if eq_ci(b, b"begin") {
566        return Some(Token::Begin);
567    }
568    if eq_ci(b, b"order") {
569        return Some(Token::Order);
570    }
571    if eq_ci(b, b"limit") {
572        return Some(Token::Limit);
573    }
574    if eq_ci(b, b"group") {
575        return Some(Token::Group);
576    }
577    if eq_ci(b, b"union") {
578        return Some(Token::Union);
579    }
580    if eq_ci(b, b"inner") {
581        return Some(Token::Inner);
582    }
583    if eq_ci(b, b"cross") {
584        return Some(Token::Cross);
585    }
586    if eq_ci(b, b"outer") {
587        return Some(Token::Outer);
588    }
589    None
590}
591
592#[inline]
593fn kw_len6(b: &[u8]) -> Option<Token> {
594    // 9 keywords: select, create, insert, values, commit, having, offset, tables, except
595    if eq_ci(b, b"select") {
596        return Some(Token::Select);
597    }
598    if eq_ci(b, b"tables") {
599        return Some(Token::Tables);
600    }
601    if eq_ci(b, b"except") {
602        return Some(Token::Except);
603    }
604    if eq_ci(b, b"create") {
605        return Some(Token::Create);
606    }
607    if eq_ci(b, b"insert") {
608        return Some(Token::Insert);
609    }
610    if eq_ci(b, b"values") {
611        return Some(Token::Values);
612    }
613    if eq_ci(b, b"commit") {
614        return Some(Token::Commit);
615    }
616    if eq_ci(b, b"having") {
617        return Some(Token::Having);
618    }
619    if eq_ci(b, b"offset") {
620        return Some(Token::Offset);
621    }
622    None
623}
624
625#[inline]
626fn kw_len7(b: &[u8]) -> Option<Token> {
627    // 4 keywords: between, default, release, extract
628    if eq_ci(b, b"between") {
629        return Some(Token::Between);
630    }
631    if eq_ci(b, b"default") {
632        return Some(Token::Default);
633    }
634    if eq_ci(b, b"release") {
635        return Some(Token::Release);
636    }
637    if eq_ci(b, b"extract") {
638        return Some(Token::Extract);
639    }
640    None
641}
642
643#[inline]
644fn kw_len8(b: &[u8]) -> Option<Token> {
645    // 3 keywords: rollback, distinct, interval
646    if eq_ci(b, b"rollback") {
647        return Some(Token::Rollback);
648    }
649    if eq_ci(b, b"distinct") {
650        return Some(Token::Distinct);
651    }
652    if eq_ci(b, b"interval") {
653        return Some(Token::Interval);
654    }
655    None
656}
657
658#[inline]
659fn kw_len9(b: &[u8]) -> Option<Token> {
660    // 1 keyword: savepoint
661    if eq_ci(b, b"savepoint") {
662        return Some(Token::Savepoint);
663    }
664    None
665}
666
667#[inline]
668fn kw_len10(b: &[u8]) -> Option<Token> {
669    // 1 keyword: connection
670    if eq_ci(b, b"connection") {
671        return Some(Token::Connection);
672    }
673    None
674}
675
676#[inline]
677fn kw_len11(b: &[u8]) -> Option<Token> {
678    // 1 keyword: publication
679    if eq_ci(b, b"publication") {
680        return Some(Token::Publication);
681    }
682    None
683}
684
685#[inline]
686fn kw_len12(b: &[u8]) -> Option<Token> {
687    // 1 keyword: subscription
688    if eq_ci(b, b"subscription") {
689        return Some(Token::Subscription);
690    }
691    None
692}
693
694/// Lex a `'...'` string literal or `"..."` quoted identifier. The opening
695/// quote sits at `input[start]`; `quote` is its byte value. `is_ident` selects
696/// the resulting token shape.
697///
698/// PG-style doubling escapes the quote: `''` inside `'...'` is a literal `'`,
699/// same for `""` inside `"..."`.
700fn lex_quoted(
701    input: &str,
702    start: usize,
703    quote: u8,
704    is_ident: bool,
705) -> Result<(Token, usize), LexError> {
706    let bytes = input.as_bytes();
707    let mut i = start + 1;
708    let mut s = String::new();
709    loop {
710        if i >= bytes.len() {
711            return Err(LexError {
712                kind: if is_ident {
713                    LexErrorKind::UnterminatedQuotedIdent
714                } else {
715                    LexErrorKind::UnterminatedString
716                },
717                pos: start,
718            });
719        }
720        if bytes[i] == quote {
721            if peek_eq(bytes, i + 1, quote) {
722                s.push(quote as char);
723                i += 2;
724            } else {
725                i += 1;
726                break;
727            }
728        } else {
729            let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
730            s.push(ch);
731            i += ch.len_utf8();
732        }
733    }
734    let tok = if is_ident {
735        Token::QuotedIdent(s)
736    } else {
737        Token::String(s)
738    };
739    Ok((tok, i - start))
740}
741
742fn lex_number(s: &str) -> Result<(Token, usize), LexErrorKind> {
743    let bytes = s.as_bytes();
744    let mut i = 0usize;
745    let mut is_float = false;
746
747    while i < bytes.len() && bytes[i].is_ascii_digit() {
748        i += 1;
749    }
750    if i < bytes.len() && bytes[i] == b'.' {
751        is_float = true;
752        i += 1;
753        while i < bytes.len() && bytes[i].is_ascii_digit() {
754            i += 1;
755        }
756    }
757    if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
758        is_float = true;
759        i += 1;
760        if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
761            i += 1;
762        }
763        let exp_start = i;
764        while i < bytes.len() && bytes[i].is_ascii_digit() {
765            i += 1;
766        }
767        if exp_start == i {
768            return Err(LexErrorKind::BadNumber(s[..i].to_string()));
769        }
770    }
771
772    let lit = &s[..i];
773    if is_float {
774        lit.parse::<f64>()
775            .map(|v| (Token::Float(v), i))
776            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
777    } else {
778        lit.parse::<i64>()
779            .map(|v| (Token::Integer(v), i))
780            .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
781    }
782}
783
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Tokenize `s`, panicking on any lex error.
    fn lex(s: &str) -> Vec<Token> {
        tokenize(s).expect("lex ok")
    }

    #[test]
    fn empty_yields_only_eof() {
        assert_eq!(lex(""), vec![Token::Eof]);
    }

    #[test]
    fn whitespace_only_yields_only_eof() {
        assert_eq!(lex("   \t\n  "), vec![Token::Eof]);
    }

    #[test]
    fn keywords_are_case_insensitive() {
        assert_eq!(
            lex("SELECT select Select"),
            vec![Token::Select, Token::Select, Token::Select, Token::Eof]
        );
    }

    #[test]
    fn identifiers_lowercase_ascii() {
        assert_eq!(
            lex("hello WORLD _x x1"),
            vec![
                Token::Ident("hello".into()),
                Token::Ident("world".into()),
                Token::Ident("_x".into()),
                Token::Ident("x1".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn quoted_identifier_keeps_case_and_handles_embedded_quote() {
        assert_eq!(
            lex(r#""User Name" "a""b""#),
            vec![
                Token::QuotedIdent("User Name".into()),
                Token::QuotedIdent("a\"b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn integer_and_float_literals() {
        assert_eq!(
            lex("0 42 1.5 .5 1e10 2.5e-3"),
            vec![
                Token::Integer(0),
                Token::Integer(42),
                Token::Float(1.5),
                Token::Float(0.5),
                Token::Float(1e10),
                Token::Float(2.5e-3),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn negative_number_is_minus_then_integer() {
        // PG follows this: unary minus is a separate token, parser folds it.
        assert_eq!(
            lex("-42"),
            vec![Token::Minus, Token::Integer(42), Token::Eof]
        );
    }

    #[test]
    fn string_literal_doubled_quote_escape() {
        assert_eq!(
            lex("'hello' 'it''s'"),
            vec![
                Token::String("hello".into()),
                Token::String("it's".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn all_comparison_and_arithmetic_operators() {
        assert_eq!(
            lex("= <> != < <= > >= + - * /"),
            vec![
                Token::Eq,
                Token::NotEq,
                Token::NotEq,
                Token::Lt,
                Token::LtEq,
                Token::Gt,
                Token::GtEq,
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn punctuation() {
        assert_eq!(
            lex("( ) , ; ."),
            vec![
                Token::LParen,
                Token::RParen,
                Token::Comma,
                Token::Semicolon,
                Token::Dot,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn line_comment_skipped() {
        assert_eq!(
            lex("SELECT -- trailing junk\nFROM"),
            vec![Token::Select, Token::From, Token::Eof]
        );
    }

    #[test]
    fn block_comment_skipped() {
        assert_eq!(
            lex("SELECT /* skipped */ 1"),
            vec![Token::Select, Token::Integer(1), Token::Eof]
        );
    }

    #[test]
    fn unterminated_string_errors() {
        let err = tokenize("'oops").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
        assert_eq!(err.pos, 0);
    }

    #[test]
    fn unterminated_quoted_ident_errors() {
        let err = tokenize("x \"oops").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedQuotedIdent));
        // Position is that of the opening quote, not the end of input.
        assert_eq!(err.pos, 2);
    }

    #[test]
    fn unterminated_block_comment_errors() {
        let err = tokenize("/* never closed").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedBlockComment));
        assert_eq!(err.pos, 0);
    }

    #[test]
    fn unknown_char_errors() {
        let err = tokenize("@").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnknownChar('@')));
    }

    #[test]
    fn non_ascii_outside_quotes_is_unknown_char() {
        let err = tokenize("é").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnknownChar('é')));
        assert_eq!(err.pos, 0);
    }

    #[test]
    fn non_ascii_inside_string_is_preserved() {
        assert_eq!(
            lex("'héllo'"),
            vec![Token::String("héllo".into()), Token::Eof]
        );
    }

    #[test]
    fn exponent_without_digits_is_bad_number() {
        let err = tokenize("1e").unwrap_err();
        assert_eq!(err.kind, LexErrorKind::BadNumber("1e".into()));
        assert_eq!(err.pos, 0);
    }

    #[test]
    fn dot_in_qualified_column() {
        assert_eq!(
            lex("t.col"),
            vec![
                Token::Ident("t".into()),
                Token::Dot,
                Token::Ident("col".into()),
                Token::Eof,
            ]
        );
    }

    // --- v0.11 brackets + distance op -------------------------------------

    #[test]
    fn brackets_are_distinct_tokens() {
        assert_eq!(
            lex("[ ]"),
            vec![Token::LBracket, Token::RBracket, Token::Eof]
        );
    }

    #[test]
    fn l2_distance_is_three_char_token() {
        assert_eq!(
            lex("a <-> b"),
            vec![
                Token::Ident("a".into()),
                Token::L2Distance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        // Bare `<-` should NOT match L2Distance.
        assert_eq!(
            lex("a <- b"),
            vec![
                Token::Ident("a".into()),
                Token::Lt,
                Token::Minus,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn order_by_limit_are_keywords() {
        assert_eq!(
            lex("ORDER BY LIMIT"),
            vec![Token::Order, Token::By, Token::Limit, Token::Eof]
        );
    }

    // --- v1.2: pgvector distance ops + PG cast --------------------------

    #[test]
    fn inner_product_operator_3char() {
        assert_eq!(
            lex("a <#> b"),
            vec![
                Token::Ident("a".into()),
                Token::InnerProduct,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn cosine_distance_operator_3char() {
        assert_eq!(
            lex("a <=> b"),
            vec![
                Token::Ident("a".into()),
                Token::CosineDistance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        // Make sure `<=` and `<>` and `<->` still lex right when `<=>` is
        // around (greedy match takes the longest).
        assert_eq!(
            lex("a <= b"),
            vec![
                Token::Ident("a".into()),
                Token::LtEq,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn double_colon_cast_token() {
        assert_eq!(
            lex("x::INT"),
            vec![
                Token::Ident("x".into()),
                Token::DoubleColon,
                Token::Ident("int".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn lone_single_colon_is_unknown_char() {
        let err = tokenize(":x").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnknownChar(':')));
    }

    // --- backtick-quoted identifiers -------------------------------------

    #[test]
    fn backtick_identifier_keeps_case_and_handles_embedded_backtick() {
        assert_eq!(
            lex("`My``Col`"),
            vec![Token::QuotedIdent("My`Col".into()), Token::Eof]
        );
    }

    // --- v4.14 / v6.4.5: JSON operators ---------------------------------

    #[test]
    fn json_operators_use_longest_match() {
        assert_eq!(
            lex("a -> b ->> c #> d #>> e @> f"),
            vec![
                Token::Ident("a".into()),
                Token::JsonGet,
                Token::Ident("b".into()),
                Token::JsonGetText,
                Token::Ident("c".into()),
                Token::JsonGetPath,
                Token::Ident("d".into()),
                Token::JsonGetPathText,
                Token::Ident("e".into()),
                Token::JsonContains,
                Token::Ident("f".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn lone_hash_is_unknown_char() {
        let err = tokenize("#").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnknownChar('#')));
    }

    // --- v6.1.1: `$N` placeholders ---------------------------------------

    #[test]
    fn placeholder_is_one_based_and_fits_u16() {
        assert_eq!(
            lex("$1 $65535"),
            vec![
                Token::Placeholder(1),
                Token::Placeholder(65535),
                Token::Eof,
            ]
        );
        for bad in ["$0", "$65536"] {
            let err = tokenize(bad).unwrap_err();
            assert_eq!(err.kind, LexErrorKind::BadNumber(bad.into()));
            assert_eq!(err.pos, 0);
        }
    }

    #[test]
    fn bare_dollar_is_unknown_char() {
        let err = tokenize("$").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnknownChar('$')));
    }

    // --- v6.1.2+: publication / subscription keywords --------------------

    #[test]
    fn publication_keywords() {
        assert_eq!(
            lex("DROP PUBLICATION p FOR ALL TABLES EXCEPT t"),
            vec![
                Token::Drop,
                Token::Publication,
                Token::Ident("p".into()),
                Token::For,
                Token::All,
                Token::Tables,
                Token::Except,
                Token::Ident("t".into()),
                Token::Eof,
            ]
        );
        assert_eq!(
            lex("subscription Connection"),
            vec![Token::Subscription, Token::Connection, Token::Eof]
        );
    }
}
spg_sql/lexer.rs

spg_sql/
lexer.rs