// fsqlite_parser/lexer.rs
// bd-2tu6: §10.1 SQL Lexer
//
// Converts SQL text into a stream of tokens. Uses memchr for accelerated
// string scanning. Tracks line/column for error reporting.

use fsqlite_ast::Span;
use fsqlite_types::limits::MAX_VARIABLE_NUMBER;
use memchr::memchr;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Instant;
use tracing::Level;

use crate::token::{Token, TokenKind};

/// Histogram buckets for `fsqlite_tokenize_duration_seconds`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct TokenizeDurationSecondsHistogram {
    /// Duration <= 100 µs.
    pub le_100us: u64,
    /// Duration <= 250 µs.
    pub le_250us: u64,
    /// Duration <= 500 µs.
    pub le_500us: u64,
    /// Duration <= 1 ms.
    pub le_1ms: u64,
    /// Duration <= 5 ms.
    pub le_5ms: u64,
    /// Duration > 5 ms.
    pub gt_5ms: u64,
}

/// Point-in-time tokenize metric snapshot.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct TokenizeMetricsSnapshot {
    /// Monotonic token counter across all tokenize calls.
    pub fsqlite_tokenize_tokens_total: u64,
    /// Histogram buckets for tokenize runtime.
    pub fsqlite_tokenize_duration_seconds: TokenizeDurationSecondsHistogram,
    /// Total tokenize observations recorded in histogram.
    pub fsqlite_tokenize_duration_seconds_count: u64,
    /// Sum of tokenize durations in microseconds.
    pub fsqlite_tokenize_duration_seconds_sum_micros: u64,
}

static FSQLITE_TOKENIZE_TOKENS_TOTAL: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_100US: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_250US: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_500US: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_1MS: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_5MS: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_GT_5MS: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_COUNT: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_SUM_MICROS: AtomicU64 = AtomicU64::new(0);

fn saturating_u64_from_usize(value: usize) -> u64 {
    u64::try_from(value).unwrap_or(u64::MAX)
}

fn saturating_u64_from_u128(value: u128) -> u64 {
    u64::try_from(value).unwrap_or(u64::MAX)
}

fn record_tokenize_metrics(token_count: usize, elapsed_micros: u64) {
    FSQLITE_TOKENIZE_TOKENS_TOTAL
        .fetch_add(saturating_u64_from_usize(token_count), Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_COUNT.fetch_add(1, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_SUM_MICROS.fetch_add(elapsed_micros, Ordering::Relaxed);

    let bucket = match elapsed_micros {
        0..=100 => &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_100US,
        101..=250 => &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_250US,
        251..=500 => &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_500US,
        501..=1_000 => &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_1MS,
        1_001..=5_000 => &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_5MS,
        _ => &FSQLITE_TOKENIZE_DURATION_SECONDS_GT_5MS,
    };
    bucket.fetch_add(1, Ordering::Relaxed);
}

/// Point-in-time snapshot of tokenize metrics.
#[must_use]
pub fn tokenize_metrics_snapshot() -> TokenizeMetricsSnapshot {
    TokenizeMetricsSnapshot {
        fsqlite_tokenize_tokens_total: FSQLITE_TOKENIZE_TOKENS_TOTAL.load(Ordering::Relaxed),
        fsqlite_tokenize_duration_seconds: TokenizeDurationSecondsHistogram {
            le_100us: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_100US.load(Ordering::Relaxed),
            le_250us: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_250US.load(Ordering::Relaxed),
            le_500us: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_500US.load(Ordering::Relaxed),
            le_1ms: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_1MS.load(Ordering::Relaxed),
            le_5ms: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_5MS.load(Ordering::Relaxed),
            gt_5ms: FSQLITE_TOKENIZE_DURATION_SECONDS_GT_5MS.load(Ordering::Relaxed),
        },
        fsqlite_tokenize_duration_seconds_count: FSQLITE_TOKENIZE_DURATION_SECONDS_COUNT
            .load(Ordering::Relaxed),
        fsqlite_tokenize_duration_seconds_sum_micros: FSQLITE_TOKENIZE_DURATION_SECONDS_SUM_MICROS
            .load(Ordering::Relaxed),
    }
}

/// Reset tokenize metrics (used by tests/diagnostics).
pub fn reset_tokenize_metrics() {
    FSQLITE_TOKENIZE_TOKENS_TOTAL.store(0, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_LE_100US.store(0, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_LE_250US.store(0, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_LE_500US.store(0, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_LE_1MS.store(0, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_LE_5MS.store(0, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_GT_5MS.store(0, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_COUNT.store(0, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_SUM_MICROS.store(0, Ordering::Relaxed);
}

/// SQL lexer that produces a stream of tokens from source text.
pub struct Lexer<'a> {
    /// The source bytes (UTF-8).
    src: &'a [u8],
    /// Current byte offset into src.
    pos: usize,
    /// Current line number (1-based).
    line: u32,
    /// Current column number (1-based).
    col: u32,
    /// Whether TRACE character-level logging is enabled.
    trace_chars: bool,
}

impl<'a> Lexer<'a> {
    fn log_token(token: &Token) {
        tracing::debug!(
            target: "fsqlite.parse",
            token = ?token.kind,
            start = token.span.start,
            end = token.span.end,
            line = token.line,
            col = token.col,
            "tokenized token"
        );
    }

    /// Create a new lexer for the given SQL source text.
    #[must_use]
    pub fn new(source: &'a str) -> Self {
        Self {
            src: source.as_bytes(),
            pos: 0,
            line: 1,
            col: 1,
            trace_chars: tracing::enabled!(target: "fsqlite.parse", Level::TRACE),
        }
    }

    /// Tokenize the entire input into a Vec of tokens.
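    ///
    /// # Example
    ///
    /// ```ignore
    /// // A minimal usage sketch; the `fsqlite_parser::lexer` path is an
    /// // assumption (this file does not show how the module is exported).
    /// use fsqlite_parser::lexer::Lexer;
    ///
    /// let tokens = Lexer::tokenize("SELECT 1;");
    /// // The stream always ends with a single Eof token.
    /// assert!(!tokens.is_empty());
    /// ```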
    #[must_use]
    pub fn tokenize(source: &'a str) -> Vec<Token> {
        let input_bytes = source.len();
        let span = tracing::span!(
            target: "fsqlite.parse",
            Level::TRACE,
            "tokenize",
            token_count = tracing::field::Empty,
            input_bytes,
            elapsed_us = tracing::field::Empty,
        );
        let _guard = span.enter();
        let started = Instant::now();

        let mut lexer = Self::new(source);
        let mut tokens = Vec::with_capacity(input_bytes / 4 + 1);
        loop {
            let tok = lexer.next_token();
            let is_eof = tok.kind == TokenKind::Eof;
            tokens.push(tok);
            if is_eof {
                break;
            }
        }

        let elapsed = started.elapsed();
        let elapsed_us = saturating_u64_from_u128(elapsed.as_micros());
        span.record("token_count", saturating_u64_from_usize(tokens.len()));
        span.record("elapsed_us", elapsed_us);
        record_tokenize_metrics(tokens.len(), elapsed_us);
        tokens
    }

    /// Expose tokenize metrics as a snapshot.
    #[must_use]
    pub fn metrics_snapshot() -> TokenizeMetricsSnapshot {
        tokenize_metrics_snapshot()
    }

    /// Reset tokenize metrics.
    pub fn reset_metrics() {
        reset_tokenize_metrics();
    }

    /// Produce the next token.
    pub fn next_token(&mut self) -> Token {
        self.skip_whitespace_and_comments();

        if self.pos >= self.src.len() {
            let token = self.make_token(TokenKind::Eof, self.pos, self.pos);
            Self::log_token(&token);
            return token;
        }

        let start = self.pos;
        let start_line = self.line;
        let start_col = self.col;
        let ch = self.src[self.pos];

        let kind = match ch {
            // String literal (single-quoted)
            b'\'' => self.lex_string(),

            // Double-quoted identifier
            b'"' => self.lex_double_quoted_id(),

            // Backtick-quoted identifier
            b'`' => self.lex_backtick_id(),

            // Bracket-quoted identifier
            b'[' => self.lex_bracket_id(),

            // Blob literal or hex
            b'X' | b'x' if self.peek_at(1) == Some(b'\'') => self.lex_blob(),

            // Numbers
            b'0'..=b'9' => self.lex_number(),
            b'.' if self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) => self.lex_number(),

            // Identifiers and keywords
            b'a'..=b'z' | b'A'..=b'Z' | b'_' | 0x80..=0xFF => self.lex_identifier(),

            // Bind parameters
            b'?' => self.lex_question(),
            b':' => self.lex_colon_param(),
            b'@' => self.lex_at_param(),
            b'$' => self.lex_dollar_param(),

            // Operators and punctuation
            b'+' => {
                self.advance();
                TokenKind::Plus
            }
            b'*' => {
                self.advance();
                TokenKind::Star
            }
            b'/' => {
                self.advance();
                TokenKind::Slash
            }
            b'%' => {
                self.advance();
                TokenKind::Percent
            }
            b'&' => {
                self.advance();
                TokenKind::Ampersand
            }
            b'~' => {
                self.advance();
                TokenKind::Tilde
            }
            b',' => {
                self.advance();
                TokenKind::Comma
            }
            b';' => {
                self.advance();
                TokenKind::Semicolon
            }
            b'(' => {
                self.advance();
                TokenKind::LeftParen
            }
            b')' => {
                self.advance();
                TokenKind::RightParen
            }
            b'.' => {
                self.advance();
                TokenKind::Dot
            }

            // Multi-character operators
            b'-' => self.lex_minus_or_arrow(),
            b'<' => self.lex_lt(),
            b'>' => self.lex_gt(),
            b'=' => self.lex_eq(),
            b'!' => self.lex_bang(),
            b'|' => self.lex_pipe(),

            _ => {
                self.advance();
                let s = String::from_utf8_lossy(&self.src[start..self.pos]).into_owned();
                TokenKind::Error(format!("unexpected character: {s}"))
            }
        };

        let token = Token {
            kind,
            #[allow(clippy::cast_possible_truncation)]
            span: Span::new(start as u32, self.pos as u32),
            line: start_line,
            col: start_col,
        };

        Self::log_token(&token);
        token
    }

    // -----------------------------------------------------------------------
    // Helpers
    // -----------------------------------------------------------------------

    #[allow(clippy::cast_possible_truncation)]
    fn advance_by(&mut self, n: usize) {
        if n == 0 {
            return;
        }
        let end = self.pos + n;
        let slice = &self.src[self.pos..end];
        #[allow(clippy::naive_bytecount)]
        let newlines = slice.iter().filter(|&&b| b == b'\n').count();
        if newlines > 0 {
            self.line += newlines as u32;
            let last_nl = slice.iter().rposition(|&b| b == b'\n').unwrap_or(0);
            self.col = (n - last_nl) as u32;
        } else {
            self.col += n as u32;
        }
        self.pos = end;
    }

    fn advance(&mut self) -> u8 {
        let pos = self.pos;
        let line = self.line;
        let col = self.col;
        let ch = self.src[self.pos];
        self.pos += 1;
        if ch == b'\n' {
            self.line += 1;
            self.col = 1;
        } else {
            self.col += 1;
        }
        if self.trace_chars {
            tracing::trace!(
                target: "fsqlite.parse",
                byte = ch,
                pos,
                line,
                col,
                "tokenize char"
            );
        }
        ch
    }

    fn peek(&self) -> Option<u8> {
        self.src.get(self.pos).copied()
    }

    fn peek_at(&self, offset: usize) -> Option<u8> {
        self.src.get(self.pos + offset).copied()
    }

    #[allow(clippy::cast_possible_truncation)]
    fn make_token(&self, kind: TokenKind, start: usize, end: usize) -> Token {
        Token {
            kind,
            span: Span::new(start as u32, end as u32),
            line: self.line,
            col: self.col,
        }
    }

    /// Skip whitespace, line comments (`--`), and block comments (`/* */`).
    fn skip_whitespace_and_comments(&mut self) {
        loop {
            // Skip whitespace
            let mut ws_len = 0;
            while self.pos + ws_len < self.src.len()
                && self.src[self.pos + ws_len].is_ascii_whitespace()
            {
                ws_len += 1;
            }
            if ws_len > 0 {
                self.advance_by(ws_len);
            }

            if self.pos >= self.src.len() {
                break;
            }

            // Line comment: `-- ...`
            if self.src[self.pos] == b'-' && self.peek_at(1) == Some(b'-') {
                self.advance(); // skip -
                self.advance(); // skip -
                while self.pos < self.src.len() && self.src[self.pos] != b'\n' {
                    self.advance();
                }
                continue;
            }

            // Block comment: `/* ... */` (SQLite does NOT support nesting)
            if self.src[self.pos] == b'/' && self.peek_at(1) == Some(b'*') {
                self.advance(); // skip /
                self.advance(); // skip *
                let closed = loop {
                    if self.pos >= self.src.len() {
                        break false;
                    }
                    if self.src[self.pos] == b'*' && self.peek_at(1) == Some(b'/') {
                        self.advance();
                        self.advance();
                        break true;
                    }
                    self.advance();
                };
                if !closed {
                    // Unclosed block comment consumes to EOF
                    self.pos = self.src.len();
                }
                continue;
            }

            break;
        }
    }

    // -----------------------------------------------------------------------
    // Literal tokenizers
    // -----------------------------------------------------------------------

    fn lex_string(&mut self) -> TokenKind {
        let start = self.pos;
        self.advance(); // skip opening quote

        let mut value = String::new();
        loop {
            // Use memchr to find the next single quote quickly
            let remaining = &self.src[self.pos..];
            if let Some(offset) = memchr(b'\'', remaining) {
                // Append bytes up to the quote
                value.push_str(&String::from_utf8_lossy(
                    &self.src[self.pos..self.pos + offset],
                ));
                // Advance past the accumulated bytes and the quote
                self.advance_by(offset);
                self.advance(); // the quote itself

                // Check for escaped quote ('')
                if self.peek() == Some(b'\'') {
                    value.push('\'');
                    self.advance();
                } else {
                    return TokenKind::String(value);
                }
            } else {
                // Unterminated string
                self.pos = self.src.len();
                return TokenKind::Error(format!(
                    "unterminated string literal starting at byte {}",
                    start
                ));
            }
        }
    }

    /// Lex a double-quoted identifier. Sets the EP_DblQuoted flag.
    fn lex_double_quoted_id(&mut self) -> TokenKind {
        let start = self.pos;
        self.advance(); // skip opening "

        let mut value = String::new();
        loop {
            let remaining = &self.src[self.pos..];
            if let Some(offset) = memchr(b'"', remaining) {
                value.push_str(&String::from_utf8_lossy(
                    &self.src[self.pos..self.pos + offset],
                ));
                self.advance_by(offset);
                self.advance(); // the quote

                // Doubled-quote escape: "" -> "
                if self.peek() == Some(b'"') {
                    value.push('"');
                    self.advance();
                } else {
                    return TokenKind::QuotedId(value, true);
                }
            } else {
                self.pos = self.src.len();
                return TokenKind::Error(format!(
                    "unterminated double-quoted identifier at byte {}",
                    start
                ));
            }
        }
    }

    /// Lex a backtick-quoted identifier.
    fn lex_backtick_id(&mut self) -> TokenKind {
        let start = self.pos;
        self.advance(); // skip `

        let mut value = String::new();
        loop {
            let remaining = &self.src[self.pos..];
            if let Some(offset) = memchr(b'`', remaining) {
                value.push_str(&String::from_utf8_lossy(
                    &self.src[self.pos..self.pos + offset],
                ));
                self.advance_by(offset);
                self.advance(); // the backtick

                if self.peek() == Some(b'`') {
                    value.push('`');
                    self.advance();
                } else {
                    return TokenKind::QuotedId(value, false);
                }
            } else {
                self.pos = self.src.len();
                return TokenKind::Error(format!(
                    "unterminated backtick identifier at byte {}",
                    start
                ));
            }
        }
    }

    /// Lex a bracket-quoted identifier `[name]`.
    fn lex_bracket_id(&mut self) -> TokenKind {
        let start = self.pos;
        self.advance(); // skip [

        let mut value = String::new();
        let remaining = &self.src[self.pos..];
        if let Some(offset) = memchr(b']', remaining) {
            value.push_str(&String::from_utf8_lossy(
                &self.src[self.pos..self.pos + offset],
            ));
            self.advance_by(offset);
            self.advance(); // skip ]
            TokenKind::QuotedId(value, false)
        } else {
            self.pos = self.src.len();
            TokenKind::Error(format!("unterminated bracket identifier at byte {}", start))
        }
    }

    /// Lex a blob literal `X'...'` / `x'...'`.
    fn lex_blob(&mut self) -> TokenKind {
        let start = self.pos;
        self.advance(); // skip X/x
        self.advance(); // skip '

        let hex_start = self.pos;
        let remaining = &self.src[self.pos..];
        if let Some(offset) = memchr(b'\'', remaining) {
            let hex_bytes = &self.src[hex_start..hex_start + offset];
            self.advance_by(offset);
            self.advance(); // skip closing '

            // Validate hex content
            if hex_bytes.len() % 2 != 0 {
                return TokenKind::Error(format!(
                    "blob literal has odd number of hex digits at byte {}",
                    start
                ));
            }

            // Work directly on raw bytes to avoid panics from
            // string-slicing multi-byte UTF-8 sequences.
            let mut bytes = Vec::with_capacity(hex_bytes.len() / 2);
            for pair in hex_bytes.chunks_exact(2) {
                let hi = hex_digit(pair[0]);
                let lo = hex_digit(pair[1]);
                match (hi, lo) {
                    (Some(h), Some(l)) => bytes.push((h << 4) | l),
                    _ => {
                        return TokenKind::Error(format!(
                            "invalid hex in blob literal at byte {start}"
                        ));
                    }
                }
            }
            TokenKind::Blob(bytes)
        } else {
            self.pos = self.src.len();
            TokenKind::Error(format!("unterminated blob literal at byte {}", start))
        }
    }

    /// Lex a number: integer, hex integer, or float.
    fn lex_number(&mut self) -> TokenKind {
        let start = self.pos;

        // Check for hex prefix
        if self.src[self.pos] == b'0' && self.peek_at(1).is_some_and(|c| c == b'x' || c == b'X') {
            self.advance(); // 0
            self.advance(); // x
            let hex_start = self.pos;
            while self.pos < self.src.len() && self.src[self.pos].is_ascii_hexdigit() {
                self.advance();
            }
            if self.pos == hex_start {
                return TokenKind::Error("empty hex literal".to_owned());
            }
            let hex_str = String::from_utf8_lossy(&self.src[hex_start..self.pos]);
            // Strip leading zeros then check significant digit count,
            // matching C SQLite's sqlite3DecOrHexToI64 which rejects
            // hex literals with >16 significant digits.
            let significant = hex_str.trim_start_matches('0');
            if significant.len() > 16 {
                return TokenKind::Error(format!("hex literal out of range at byte {start}"));
            }
            let parse_str = if significant.is_empty() {
                "0"
            } else {
                significant
            };
            // Parse as u64 and bitwise-cast to i64, matching C SQLite's
            // sqlite3DecOrHexToI64 which uses memcpy(pOut, &u, 8).
            return match u64::from_str_radix(parse_str, 16) {
                Ok(v) => {
                    #[allow(clippy::cast_possible_wrap)]
                    let i = v as i64;
                    TokenKind::Integer(i)
                }
                Err(_) => TokenKind::Error(format!("hex literal out of range at byte {start}")),
            };
        }

        // Decimal integer or float
        let mut is_float = false;

        // Integer part (may be empty for `.5` style)
        while self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
            self.advance();
        }

        // Helper to check if the current position (+ offset) starts a valid exponent.
        let is_valid_exponent = |lexer: &Self, mut offset: usize| -> bool {
            if let Some(c) = lexer.peek_at(offset) {
                if c == b'e' || c == b'E' {
                    offset += 1;
                    if let Some(s) = lexer.peek_at(offset) {
                        if s == b'+' || s == b'-' {
                            offset += 1;
                        }
                    }
                    if let Some(d) = lexer.peek_at(offset) {
                        return d.is_ascii_digit();
                    }
                }
            }
            false
        };

        // Fractional part
        if self.pos < self.src.len()
            && self.src[self.pos] == b'.'
            && (self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) || is_valid_exponent(self, 1))
        {
            is_float = true;
            self.advance(); // skip dot
            while self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
                self.advance();
            }
        } else if self.pos < self.src.len()
            && self.src[self.pos] == b'.'
            && start < self.pos // we had digits before the dot
            && !self.peek_at(1).is_some_and(|c| c.is_ascii_alphanumeric() || c == b'_')
        {
            // e.g. `123.` with nothing meaningful after it -- still a float
            is_float = true;
            self.advance(); // skip dot
        }

        // Handle case where input starts with '.'
        if self.src[start] == b'.' {
            is_float = true;
        }

        // Exponent
        if is_valid_exponent(self, 0) {
            is_float = true;
            self.advance(); // skip e/E
            if self.pos < self.src.len()
                && (self.src[self.pos] == b'+' || self.src[self.pos] == b'-')
            {
                self.advance();
            }
            while self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
                self.advance();
            }
        }

        // SQLite strictness: a number cannot be immediately followed by an
        // alphabetic character or an underscore; that is an "unrecognized token" error.
        if let Some(c) = self.peek() {
            if c.is_ascii_alphabetic()
                || c == b'_'
                || (c == b'.'
                    && self
                        .peek_at(1)
                        .is_some_and(|n| n.is_ascii_alphabetic() || n == b'_'))
            {
                let err_start = start;
                while self.pos < self.src.len() {
                    let ch = self.src[self.pos];
                    if ch.is_ascii_alphanumeric() || ch == b'_' || ch == b'.' {
                        self.advance();
                    } else {
                        break;
                    }
                }
                let err_text = String::from_utf8_lossy(&self.src[err_start..self.pos]);
                return TokenKind::Error(format!("unrecognized token: \"{err_text}\""));
            }
        }

        let text = String::from_utf8_lossy(&self.src[start..self.pos]);
        if is_float {
            match text.parse::<f64>() {
                Ok(v) => TokenKind::Float(v),
                Err(_) => {
                    // Rust's f64 parser rejects `.e4` but SQLite accepts it as 0.0.
                    let mut text_fixed = text.clone().into_owned();
                    if text_fixed.starts_with(".e") || text_fixed.starts_with(".E") {
                        text_fixed.insert(0, '0');
                    }
                    match text_fixed.parse::<f64>() {
                        Ok(v) => TokenKind::Float(v),
                        Err(_) => TokenKind::Error(format!("invalid float: {text}")),
                    }
                }
            }
        } else {
            match text.parse::<i64>() {
                Ok(v) => TokenKind::Integer(v),
                Err(_) => {
                    // SQLite promotes oversized integers to REAL. We emit a special
                    // token to allow the parser to fold `-9223372036854775808` correctly.
                    TokenKind::OversizedInt(text.into_owned())
                }
            }
        }
    }

    /// Lex an identifier or keyword.
    fn lex_identifier(&mut self) -> TokenKind {
        let start = self.pos;
        self.advance(); // first character already validated

        while self.pos < self.src.len() {
            let ch = self.src[self.pos];
            if ch.is_ascii_alphanumeric() || ch == b'_' || ch >= 0x80 {
                self.advance();
            } else {
                break;
            }
        }

        let text = String::from_utf8_lossy(&self.src[start..self.pos]).into_owned();

        // Check for keyword
        if let Some(kw) = TokenKind::lookup_keyword(&text) {
            kw
        } else {
            TokenKind::Id(text)
        }
    }

    /// Lex `?` or `?NNN`.
    fn lex_question(&mut self) -> TokenKind {
        self.advance(); // skip ?
        if self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
            let num_start = self.pos;
            while self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
                self.advance();
            }
            let text = String::from_utf8_lossy(&self.src[num_start..self.pos]);
            match text.parse::<u32>() {
                Ok(n) if (1..=MAX_VARIABLE_NUMBER).contains(&n) => TokenKind::QuestionNum(n),
                Ok(n) => TokenKind::Error(format!(
                    "variable number must be between ?1 and ?{MAX_VARIABLE_NUMBER}, got ?{n}"
                )),
                Err(_) => TokenKind::Error("invalid parameter number".to_owned()),
            }
        } else {
            TokenKind::Question
        }
    }

    fn lex_alpha_param(&mut self, prefix: char, constructor: fn(String) -> TokenKind) -> TokenKind {
        self.advance(); // skip prefix
        let name_start = self.pos;
        while self.pos < self.src.len() {
            let ch = self.src[self.pos];
            if ch.is_ascii_alphanumeric() || ch == b'_' || ch >= 0x80 {
                self.advance();
            } else if ch == b':' && self.peek_at(1) == Some(b':') {
                self.advance();
                self.advance();
            } else if ch == b'(' {
                self.advance();
                while self.pos < self.src.len() && self.src[self.pos] != b')' {
                    self.advance();
                }
                if self.pos >= self.src.len() || self.src[self.pos] != b')' {
                    let name = String::from_utf8_lossy(&self.src[name_start..self.pos]);
                    return TokenKind::Error(format!("unrecognized token: \"{prefix}{name}\""));
                }
                self.advance();
                break; // Tcl array variable parameters end after the closing paren.
            } else {
                break;
            }
        }
        if self.pos == name_start {
            return TokenKind::Error(format!("empty parameter name after '{prefix}'"));
        }
        let name = String::from_utf8_lossy(&self.src[name_start..self.pos]).into_owned();
        constructor(name)
    }

    /// Lex `:name`.
    fn lex_colon_param(&mut self) -> TokenKind {
        self.lex_alpha_param(':', TokenKind::ColonParam)
    }

    /// Lex `@name`.
    fn lex_at_param(&mut self) -> TokenKind {
        self.lex_alpha_param('@', TokenKind::AtParam)
    }

    /// Lex `$name`.
    fn lex_dollar_param(&mut self) -> TokenKind {
        self.lex_alpha_param('$', TokenKind::DollarParam)
    }

    // -----------------------------------------------------------------------
    // Multi-character operator tokenizers
    // -----------------------------------------------------------------------

    /// Lex `-`, `->`, or `->>`.
    fn lex_minus_or_arrow(&mut self) -> TokenKind {
        self.advance(); // skip -
        if self.peek() == Some(b'>') {
            self.advance(); // skip >
            if self.peek() == Some(b'>') {
                self.advance(); // skip >
                TokenKind::DoubleArrow
            } else {
                TokenKind::Arrow
            }
        } else {
            TokenKind::Minus
        }
    }

    /// Lex `<`, `<=`, `<>`, or `<<`.
    fn lex_lt(&mut self) -> TokenKind {
        self.advance(); // skip <
        match self.peek() {
            Some(b'=') => {
                self.advance();
                TokenKind::Le
            }
            Some(b'>') => {
                self.advance();
                TokenKind::LtGt
            }
            Some(b'<') => {
                self.advance();
                TokenKind::ShiftLeft
            }
            _ => TokenKind::Lt,
        }
    }

    /// Lex `>`, `>=`, or `>>`.
    fn lex_gt(&mut self) -> TokenKind {
        self.advance(); // skip >
        match self.peek() {
            Some(b'=') => {
                self.advance();
                TokenKind::Ge
            }
            Some(b'>') => {
                self.advance();
                TokenKind::ShiftRight
            }
            _ => TokenKind::Gt,
        }
    }

    /// Lex `=` or `==`.
    fn lex_eq(&mut self) -> TokenKind {
        self.advance(); // skip =
        if self.peek() == Some(b'=') {
            self.advance();
            TokenKind::EqEq
        } else {
            TokenKind::Eq
        }
    }

    /// Lex `!=`.
    fn lex_bang(&mut self) -> TokenKind {
        self.advance(); // skip !
        if self.peek() == Some(b'=') {
            self.advance();
            TokenKind::Ne
        } else {
            TokenKind::Error("unexpected '!', did you mean '!='?".to_owned())
        }
    }

    /// Lex `|` or `||`.
    fn lex_pipe(&mut self) -> TokenKind {
        self.advance(); // skip |
        if self.peek() == Some(b'|') {
            self.advance();
            TokenKind::Concat
        } else {
            TokenKind::Pipe
        }
    }
}

/// Convert an ASCII hex digit byte to its numeric value (0-15).
/// Returns `None` for non-hex bytes.
const fn hex_digit(b: u8) -> Option<u8> {
    match b {
        b'0'..=b'9' => Some(b - b'0'),
        b'a'..=b'f' => Some(b - b'a' + 10),
        b'A'..=b'F' => Some(b - b'A' + 10),
        _ => None,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn lex(src: &str) -> Vec<Token> {
        Lexer::tokenize(src)
    }

    fn kinds(src: &str) -> Vec<TokenKind> {
        lex(src).into_iter().map(|t| t.kind).collect()
    }

    #[test]
    fn test_lex_integer_literals() {
        let tokens = kinds("42 0 0xFF");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Integer(42),
                TokenKind::Integer(0),
                TokenKind::Integer(255),
                TokenKind::Eof,
            ]
        );
    }
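
    #[test]
    fn test_lex_oversized_integer_promoted_token() {
        // Sketch of the OversizedInt path in lex_number: i64::MAX is
        // 9223372036854775807, so this decimal literal cannot parse as i64
        // and the lexer defers promotion/folding to the parser.
        let tokens = kinds("9223372036854775808");
        assert_eq!(
            tokens[0],
            TokenKind::OversizedInt("9223372036854775808".to_owned())
        );
    }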

    #[test]
    fn test_lex_float_literals() {
        let tokens = kinds("3.14 1e10 .5 1.0e-3 0.0");
        // Avoid clippy::approx_constant (3.14 is interpreted as an approximation of PI),
        // but keep the test input string stable.
        let expected = 3.0 + 0.14;
        assert!(matches!(
            tokens[0],
            TokenKind::Float(v) if (v - expected).abs() < 1e-10
        ));
        assert!(matches!(tokens[1], TokenKind::Float(v) if (v - 1e10).abs() < 1.0));
        assert!(matches!(tokens[2], TokenKind::Float(v) if (v - 0.5).abs() < 1e-10));
        assert!(matches!(tokens[3], TokenKind::Float(v) if (v - 0.001).abs() < 1e-10));
        assert!(matches!(tokens[4], TokenKind::Float(v) if v.abs() < 1e-10));
        assert_eq!(tokens[5], TokenKind::Eof);
    }
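
    #[test]
    fn test_lex_trailing_dot_is_float() {
        // `123.` takes the trailing-dot branch in lex_number (digits before
        // the dot, nothing meaningful after it) and still parses as a float.
        let tokens = kinds("123.");
        assert!(matches!(tokens[0], TokenKind::Float(v) if (v - 123.0).abs() < 1e-10));
        assert_eq!(tokens[1], TokenKind::Eof);
    }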

    #[test]
    fn test_lex_string_literals() {
        let tokens = kinds("'hello' 'it''s' ''");
        assert_eq!(tokens[0], TokenKind::String("hello".to_owned()));
        assert_eq!(tokens[1], TokenKind::String("it's".to_owned()));
        assert_eq!(tokens[2], TokenKind::String(String::new()));
        assert_eq!(tokens[3], TokenKind::Eof);
    }

    #[test]
    fn test_lex_blob_literals() {
        let tokens = kinds("X'CAFE' x'00ff' X''");
        assert_eq!(tokens[0], TokenKind::Blob(vec![0xCA, 0xFE]));
        assert_eq!(tokens[1], TokenKind::Blob(vec![0x00, 0xFF]));
        assert_eq!(tokens[2], TokenKind::Blob(vec![]));
        assert_eq!(tokens[3], TokenKind::Eof);
    }

    #[test]
    fn test_lex_blob_odd_hex_error() {
        let tokens = kinds("X'CAF'");
        assert!(matches!(tokens[0], TokenKind::Error(_)));
    }

    #[test]
    fn test_lex_blob_non_ascii_no_panic() {
        // bd-20gf regression: multi-byte UTF-8 inside a blob literal must
        // produce an error, not panic on string-slice boundary.
        let tokens = kinds("X'U\u{05fc} '");
        assert!(matches!(tokens[0], TokenKind::Error(_)));

        // Also test with raw non-hex ASCII chars.
        let tokens2 = kinds("X'GG'");
        assert!(matches!(tokens2[0], TokenKind::Error(_)));
    }
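
    #[test]
    fn test_hex_digit_helper_values() {
        // Spot-checks the hex_digit helper that lex_blob builds bytes from.
        assert_eq!(hex_digit(b'0'), Some(0));
        assert_eq!(hex_digit(b'9'), Some(9));
        assert_eq!(hex_digit(b'a'), Some(0x0A));
        assert_eq!(hex_digit(b'F'), Some(0x0F));
        assert_eq!(hex_digit(b'g'), None);
    }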

    #[test]
    fn test_lex_variables() {
        let tokens = kinds("?1 :name @param $var ?");
        assert_eq!(tokens[0], TokenKind::QuestionNum(1));
        assert_eq!(tokens[1], TokenKind::ColonParam("name".to_owned()));
        assert_eq!(tokens[2], TokenKind::AtParam("param".to_owned()));
        assert_eq!(tokens[3], TokenKind::DollarParam("var".to_owned()));
        assert_eq!(tokens[4], TokenKind::Question);
        assert_eq!(tokens[5], TokenKind::Eof);
    }

    #[test]
    fn test_lex_quoted_identifiers() {
        let tokens = kinds("\"table_name\" [column] `backtick`");
        assert_eq!(
            tokens[0],
            TokenKind::QuotedId("table_name".to_owned(), true)
        );
        assert_eq!(tokens[1], TokenKind::QuotedId("column".to_owned(), false));
        assert_eq!(tokens[2], TokenKind::QuotedId("backtick".to_owned(), false));
    }

    #[test]
    fn test_lex_dqs_flag() {
        let tokens = kinds("\"hello\"");
        // Double-quoted strings produce QuotedId with EP_DblQuoted=true
        assert_eq!(tokens[0], TokenKind::QuotedId("hello".to_owned(), true));
    }

    #[test]
    fn test_lex_keywords() {
        let tokens = kinds("SELECT FROM WHERE INSERT CREATE TABLE CONCURRENT");
        assert_eq!(tokens[0], TokenKind::KwSelect);
        assert_eq!(tokens[1], TokenKind::KwFrom);
        assert_eq!(tokens[2], TokenKind::KwWhere);
        assert_eq!(tokens[3], TokenKind::KwInsert);
        assert_eq!(tokens[4], TokenKind::KwCreate);
        assert_eq!(tokens[5], TokenKind::KwTable);
        assert_eq!(tokens[6], TokenKind::KwConcurrent);

        // Case insensitivity
        let tokens2 = kinds("select from where");
        assert_eq!(tokens2[0], TokenKind::KwSelect);
        assert_eq!(tokens2[1], TokenKind::KwFrom);
        assert_eq!(tokens2[2], TokenKind::KwWhere);
    }
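
    #[test]
    fn test_lex_identifiers_that_are_not_keywords() {
        // Near-keyword and non-ASCII identifiers fall through lookup_keyword
        // to plain Id tokens (identifier bytes >= 0x80 are accepted).
        let tokens = kinds("selectx _id caf\u{e9}");
        assert_eq!(tokens[0], TokenKind::Id("selectx".to_owned()));
        assert_eq!(tokens[1], TokenKind::Id("_id".to_owned()));
        assert_eq!(tokens[2], TokenKind::Id("caf\u{e9}".to_owned()));
    }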

    #[test]
    fn test_lex_operators() {
        let tokens = kinds("+ - * / % & | ~ << >> = < <= > >= == != <> || -> ->>");
        let expected = vec![
            TokenKind::Plus,
            TokenKind::Minus,
            TokenKind::Star,
            TokenKind::Slash,
            TokenKind::Percent,
            TokenKind::Ampersand,
            TokenKind::Pipe,
            TokenKind::Tilde,
            TokenKind::ShiftLeft,
            TokenKind::ShiftRight,
            TokenKind::Eq,
            TokenKind::Lt,
            TokenKind::Le,
            TokenKind::Gt,
            TokenKind::Ge,
            TokenKind::EqEq,
            TokenKind::Ne,
            TokenKind::LtGt,
            TokenKind::Concat,
            TokenKind::Arrow,
            TokenKind::DoubleArrow,
            TokenKind::Eof,
        ];
        assert_eq!(tokens, expected);
    }

    #[test]
    fn test_lex_eq_vs_eqeq() {
        let tokens = kinds("= ==");
        assert_eq!(tokens[0], TokenKind::Eq);
        assert_eq!(tokens[1], TokenKind::EqEq);
    }

    #[test]
    fn test_lex_ne_vs_ltgt() {
        let tokens = kinds("!= <>");
        assert_eq!(tokens[0], TokenKind::Ne);
        assert_eq!(tokens[1], TokenKind::LtGt);
    }

    #[test]
    fn test_lex_error_unterminated_string() {
        let tokens = kinds("'hello");
        assert!(matches!(tokens[0], TokenKind::Error(_)));
    }
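
    #[test]
    fn test_lex_error_unterminated_bracket_identifier() {
        // Like unterminated strings, a missing `]` consumes to EOF and
        // surfaces as an Error token rather than a panic.
        let tokens = kinds("[col");
        assert!(matches!(tokens[0], TokenKind::Error(_)));
        assert_eq!(tokens[1], TokenKind::Eof);
    }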

    #[test]
    fn test_lex_line_column_tracking() {
        let tokens = lex("SELECT\n  a,\n  b");
        assert_eq!(tokens[0].line, 1);
        assert_eq!(tokens[0].col, 1);
        // 'a' is on line 2, col 3
        assert_eq!(tokens[1].line, 2);
        assert_eq!(tokens[1].col, 3);
        // ',' is on line 2, col 4
        assert_eq!(tokens[2].line, 2);
        assert_eq!(tokens[2].col, 4);
        // 'b' is on line 3, col 3
        assert_eq!(tokens[3].line, 3);
        assert_eq!(tokens[3].col, 3);
    }
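
    #[test]
    fn test_lex_position_after_multiline_string() {
        // advance_by counts newlines inside skipped spans, so a token after
        // a string literal that spans lines reports the correct line/column.
        let tokens = lex("'a\nb' x");
        assert_eq!(tokens[0].kind, TokenKind::String("a\nb".to_owned()));
        assert_eq!(tokens[1].line, 2);
        assert_eq!(tokens[1].col, 4);
    }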

    #[test]
    fn test_lex_whitespace_and_comments_skipped() {
        let tokens = kinds("SELECT -- this is a comment\n  a /* block */ FROM b");
        assert_eq!(tokens[0], TokenKind::KwSelect);
        assert_eq!(tokens[1], TokenKind::Id("a".to_owned()));
        assert_eq!(tokens[2], TokenKind::KwFrom);
        assert_eq!(tokens[3], TokenKind::Id("b".to_owned()));
        assert_eq!(tokens[4], TokenKind::Eof);
    }
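
    #[test]
    fn test_lex_unclosed_block_comment_consumes_to_eof() {
        // Per skip_whitespace_and_comments, an unterminated /* comment runs
        // to end of input, so only the tokens before it are produced.
        let tokens = kinds("SELECT /* never closed");
        assert_eq!(tokens[0], TokenKind::KwSelect);
        assert_eq!(tokens[1], TokenKind::Eof);
    }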

    #[test]
    fn test_lex_hex_large_values() {
        // C SQLite parses hex as u64 and memcpy to i64.
        // 0xFFFFFFFFFFFFFFFF = u64::MAX → i64 -1.
        let tokens = kinds("0xFFFFFFFFFFFFFFFF");
        assert_eq!(tokens[0], TokenKind::Integer(-1));

        // 0x8000000000000000 = i64::MIN.
        let tokens = kinds("0x8000000000000000");
        assert_eq!(tokens[0], TokenKind::Integer(i64::MIN));

        // 0x7FFFFFFFFFFFFFFF = i64::MAX.
        let tokens = kinds("0x7FFFFFFFFFFFFFFF");
        assert_eq!(tokens[0], TokenKind::Integer(i64::MAX));
    }

    #[test]
    fn test_lex_hex_overflow_17_digits_rejects() {
        // 0x10000000000000000 has 17 significant hex digits → must error,
        // not silently truncate to 0.
        let tokens = kinds("0x10000000000000000");
        assert!(
            matches!(&tokens[0], TokenKind::Error(msg) if msg.contains("out of range")),
            "expected error for 17-digit hex, got {:?}",
            tokens[0]
        );
    }

    #[test]
    fn test_lex_hex_leading_zeros_accepted() {
        // Leading zeros are stripped before the length check, so
        // 0x00000000000000001 (17 chars, 1 significant) is valid.
        let tokens = kinds("0x00000000000000001");
        assert_eq!(tokens[0], TokenKind::Integer(1));
    }

    #[test]
    fn test_lex_number_hex() {
        let tokens = kinds("0x1A 0Xff 0x0");
        assert_eq!(tokens[0], TokenKind::Integer(26));
        assert_eq!(tokens[1], TokenKind::Integer(255));
        assert_eq!(tokens[2], TokenKind::Integer(0));
        assert_eq!(tokens[3], TokenKind::Eof);
    }

    #[test]
    fn test_lex_number_unrecognized() {
        let tokens = kinds("123a 123.a");
        assert!(
            matches!(tokens[0], TokenKind::Error(ref e) if e.contains("unrecognized token: \"123a\""))
        );
        assert!(
            matches!(tokens[1], TokenKind::Error(ref e) if e.contains("unrecognized token: \"123.a\""))
        );
    }

    #[test]
    fn test_lex_number_hex_invalid() {
        let tokens = kinds("0x");
        assert!(matches!(tokens[0], TokenKind::Error(_)));
    }

    #[test]
    fn test_lex_positional_params() {
        let tokens = kinds("? ?123");
        assert_eq!(tokens[0], TokenKind::Question);
        assert_eq!(tokens[1], TokenKind::QuestionNum(123));
        assert_eq!(tokens[2], TokenKind::Eof);
    }
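
    #[test]
    fn test_lex_positional_param_upper_bound_accepted() {
        // Boundary sketch for the range check in lex_question: the maximum
        // allowed variable number (MAX_VARIABLE_NUMBER) itself is accepted.
        let src = format!("?{MAX_VARIABLE_NUMBER}");
        let tokens = kinds(&src);
        assert_eq!(tokens[0], TokenKind::QuestionNum(MAX_VARIABLE_NUMBER));
    }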

    #[test]
    fn test_lex_positional_params_reject_zero_and_out_of_range() {
        let tokens = kinds("?0 ?32767");
        assert!(
            matches!(tokens[0], TokenKind::Error(ref e) if e.contains("between ?1 and ?32766")),
            "expected ?0 to be rejected, got {:?}",
            tokens[0]
        );
        assert!(
            matches!(tokens[1], TokenKind::Error(ref e) if e.contains("between ?1 and ?32766")),
            "expected ?32767 to be rejected, got {:?}",
            tokens[1]
        );
        assert_eq!(tokens[2], TokenKind::Eof);
    }

    #[test]
    fn test_lex_named_params() {
        let tokens = kinds(":foo @bar $baz_123");
        assert_eq!(tokens[0], TokenKind::ColonParam("foo".to_owned()));
        assert_eq!(tokens[1], TokenKind::AtParam("bar".to_owned()));
        assert_eq!(tokens[2], TokenKind::DollarParam("baz_123".to_owned()));
        assert_eq!(tokens[3], TokenKind::Eof);
    }

    #[test]
    fn test_lex_named_params_with_tcl_syntax() {
        let tokens = kinds("$::foo(bar) :a::b");
        assert_eq!(tokens[0], TokenKind::DollarParam("::foo(bar)".to_owned()));
        assert_eq!(tokens[1], TokenKind::ColonParam("a::b".to_owned()));
        assert_eq!(tokens[2], TokenKind::Eof);
    }

    #[test]
    fn test_lex_named_params_with_unclosed_tcl_array_syntax() {
        let tokens = kinds("$::foo(bar");
        assert!(
            matches!(tokens[0], TokenKind::Error(ref e) if e.contains("unrecognized token")),
            "expected unterminated Tcl-style parameter to be rejected, got {:?}",
            tokens[0]
        );
        assert_eq!(tokens[1], TokenKind::Eof);
    }

    fn histogram_total(hist: &TokenizeDurationSecondsHistogram) -> u64 {
        hist.le_100us + hist.le_250us + hist.le_500us + hist.le_1ms + hist.le_5ms + hist.gt_5ms
    }

    #[test]
    fn test_tokenize_metrics_accumulate_tokens_and_histogram_samples() {
        reset_tokenize_metrics();

        let first = lex("SELECT 1;");
        let second = lex("SELECT 2;");

        let expected_total_tokens =
            u64::try_from(first.len() + second.len()).expect("small token vectors should fit");
        let snap = tokenize_metrics_snapshot();
        assert_eq!(snap.fsqlite_tokenize_tokens_total, expected_total_tokens);
        assert_eq!(snap.fsqlite_tokenize_duration_seconds_count, 2);
        assert_eq!(
            histogram_total(&snap.fsqlite_tokenize_duration_seconds),
            snap.fsqlite_tokenize_duration_seconds_count
        );
    }

    #[test]
    fn test_tokenize_metrics_reset_clears_all_fields() {
        reset_tokenize_metrics();
        let _ = lex("SELECT 42;");

        let before = tokenize_metrics_snapshot();
        assert!(before.fsqlite_tokenize_tokens_total > 0);
        assert!(before.fsqlite_tokenize_duration_seconds_count > 0);

        reset_tokenize_metrics();
        let after = tokenize_metrics_snapshot();
        assert_eq!(after.fsqlite_tokenize_tokens_total, 0);
        assert_eq!(after.fsqlite_tokenize_duration_seconds_count, 0);
        assert_eq!(after.fsqlite_tokenize_duration_seconds_sum_micros, 0);
        assert_eq!(histogram_total(&after.fsqlite_tokenize_duration_seconds), 0);
    }
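
    #[test]
    fn test_record_tokenize_metrics_bucket_edges() {
        // Sketch of the bucket edges in record_tokenize_metrics (upper bounds
        // inclusive). Like the other metrics tests, this assumes no other
        // metrics test is mutating the global counters concurrently.
        reset_tokenize_metrics();
        record_tokenize_metrics(1, 100);
        record_tokenize_metrics(1, 101);
        record_tokenize_metrics(1, 5_001);
        let snap = tokenize_metrics_snapshot();
        assert_eq!(snap.fsqlite_tokenize_duration_seconds.le_100us, 1);
        assert_eq!(snap.fsqlite_tokenize_duration_seconds.le_250us, 1);
        assert_eq!(snap.fsqlite_tokenize_duration_seconds.gt_5ms, 1);
        assert_eq!(snap.fsqlite_tokenize_duration_seconds_sum_micros, 5_202);
        assert_eq!(snap.fsqlite_tokenize_duration_seconds_count, 3);
    }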
}