1use fsqlite_ast::Span;
7use fsqlite_types::limits::MAX_VARIABLE_NUMBER;
8use hashbrown::HashSet;
9use memchr::memchr;
10use std::sync::Arc;
11use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
12use std::time::Instant;
13use tracing::Level;
14
15use crate::token::{Token, TokenKind};
16
/// Point-in-time copy of the tokenize-duration histogram.
///
/// Every field is a call count. The buckets are disjoint rather than
/// cumulative: `record_tokenize_metrics` bumps exactly one bucket per recorded
/// call, so e.g. `le_250us` only counts calls that took 101..=250 microseconds.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct TokenizeDurationSecondsHistogram {
    /// Calls that took at most 100 microseconds.
    pub le_100us: u64,
    /// Calls that took 101..=250 microseconds.
    pub le_250us: u64,
    /// Calls that took 251..=500 microseconds.
    pub le_500us: u64,
    /// Calls that took 501..=1000 microseconds.
    pub le_1ms: u64,
    /// Calls that took 1001..=5000 microseconds.
    pub le_5ms: u64,
    /// Calls that took more than 5000 microseconds.
    pub gt_5ms: u64,
}
33
/// Point-in-time copy of all process-wide tokenizer counters.
///
/// Each field is read with a separate relaxed atomic load, so a snapshot taken
/// while another thread is recording may mix values from before and after that
/// recording.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct TokenizeMetricsSnapshot {
    /// Sum of the token counts of every recorded tokenize call.
    pub fsqlite_tokenize_tokens_total: u64,
    /// Per-bucket call counts, keyed by elapsed time.
    pub fsqlite_tokenize_duration_seconds: TokenizeDurationSecondsHistogram,
    /// Number of recorded tokenize calls.
    pub fsqlite_tokenize_duration_seconds_count: u64,
    /// Sum of the elapsed time of every recorded call, in microseconds.
    pub fsqlite_tokenize_duration_seconds_sum_micros: u64,
}
46
// Process-wide tokenizer counters. All of them start at zero and are only ever
// touched with `Ordering::Relaxed` operations in this file.

// Total number of tokens produced by recorded tokenize calls.
static FSQLITE_TOKENIZE_TOKENS_TOTAL: AtomicU64 = AtomicU64::new(0);
// Duration histogram buckets (one bucket is incremented per recorded call).
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_100US: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_250US: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_500US: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_1MS: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_5MS: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_GT_5MS: AtomicU64 = AtomicU64::new(0);
// Number of recorded calls and the sum of their elapsed microseconds.
static FSQLITE_TOKENIZE_DURATION_SECONDS_COUNT: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_SUM_MICROS: AtomicU64 = AtomicU64::new(0);
// Global switch for metric collection; off until explicitly enabled.
static FSQLITE_TOKENIZE_METRICS_ENABLED: AtomicBool = AtomicBool::new(false);
57
/// Converts a `usize` to `u64`, clamping to `u64::MAX` if the value does not fit.
fn saturating_u64_from_usize(value: usize) -> u64 {
    match u64::try_from(value) {
        Ok(converted) => converted,
        Err(_) => u64::MAX,
    }
}
61
/// Converts a `u128` to `u64`, clamping to `u64::MAX` if the value does not fit.
fn saturating_u64_from_u128(value: u128) -> u64 {
    let Ok(narrowed) = u64::try_from(value) else {
        return u64::MAX;
    };
    narrowed
}
65
66fn record_tokenize_metrics(token_count: usize, elapsed_micros: u64) {
67 FSQLITE_TOKENIZE_TOKENS_TOTAL
68 .fetch_add(saturating_u64_from_usize(token_count), Ordering::Relaxed);
69 FSQLITE_TOKENIZE_DURATION_SECONDS_COUNT.fetch_add(1, Ordering::Relaxed);
70 FSQLITE_TOKENIZE_DURATION_SECONDS_SUM_MICROS.fetch_add(elapsed_micros, Ordering::Relaxed);
71
72 let bucket = match elapsed_micros {
73 0..=100 => &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_100US,
74 101..=250 => &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_250US,
75 251..=500 => &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_500US,
76 501..=1_000 => &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_1MS,
77 1_001..=5_000 => &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_5MS,
78 _ => &FSQLITE_TOKENIZE_DURATION_SECONDS_GT_5MS,
79 };
80 bucket.fetch_add(1, Ordering::Relaxed);
81}
82
83#[must_use]
85pub fn tokenize_metrics_snapshot() -> TokenizeMetricsSnapshot {
86 TokenizeMetricsSnapshot {
87 fsqlite_tokenize_tokens_total: FSQLITE_TOKENIZE_TOKENS_TOTAL.load(Ordering::Relaxed),
88 fsqlite_tokenize_duration_seconds: TokenizeDurationSecondsHistogram {
89 le_100us: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_100US.load(Ordering::Relaxed),
90 le_250us: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_250US.load(Ordering::Relaxed),
91 le_500us: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_500US.load(Ordering::Relaxed),
92 le_1ms: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_1MS.load(Ordering::Relaxed),
93 le_5ms: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_5MS.load(Ordering::Relaxed),
94 gt_5ms: FSQLITE_TOKENIZE_DURATION_SECONDS_GT_5MS.load(Ordering::Relaxed),
95 },
96 fsqlite_tokenize_duration_seconds_count: FSQLITE_TOKENIZE_DURATION_SECONDS_COUNT
97 .load(Ordering::Relaxed),
98 fsqlite_tokenize_duration_seconds_sum_micros: FSQLITE_TOKENIZE_DURATION_SECONDS_SUM_MICROS
99 .load(Ordering::Relaxed),
100 }
101}
102
/// Turns global tokenizer metric collection on or off.
///
/// The flag is stored with relaxed ordering; `tokenize_into_with_interner`
/// reads it once at the start of each call.
pub fn set_tokenize_metrics_enabled(enabled: bool) {
    FSQLITE_TOKENIZE_METRICS_ENABLED.store(enabled, Ordering::Relaxed);
}
107
/// Returns whether global tokenizer metric collection is currently enabled.
#[must_use]
pub fn tokenize_metrics_enabled() -> bool {
    FSQLITE_TOKENIZE_METRICS_ENABLED.load(Ordering::Relaxed)
}
113
114pub fn reset_tokenize_metrics() {
116 FSQLITE_TOKENIZE_TOKENS_TOTAL.store(0, Ordering::Relaxed);
117 FSQLITE_TOKENIZE_DURATION_SECONDS_LE_100US.store(0, Ordering::Relaxed);
118 FSQLITE_TOKENIZE_DURATION_SECONDS_LE_250US.store(0, Ordering::Relaxed);
119 FSQLITE_TOKENIZE_DURATION_SECONDS_LE_500US.store(0, Ordering::Relaxed);
120 FSQLITE_TOKENIZE_DURATION_SECONDS_LE_1MS.store(0, Ordering::Relaxed);
121 FSQLITE_TOKENIZE_DURATION_SECONDS_LE_5MS.store(0, Ordering::Relaxed);
122 FSQLITE_TOKENIZE_DURATION_SECONDS_GT_5MS.store(0, Ordering::Relaxed);
123 FSQLITE_TOKENIZE_DURATION_SECONDS_COUNT.store(0, Ordering::Relaxed);
124 FSQLITE_TOKENIZE_DURATION_SECONDS_SUM_MICROS.store(0, Ordering::Relaxed);
125}
126
/// Entry-count limit above which `IdentifierInterner::prepare_for_next_parse`
/// discards the interner contents.
const MAX_RETAINED_IDENTIFIER_INTERNER_ENTRIES: usize = 256;
/// Byte limit (as estimated by `IdentifierInterner::retained_bytes`) above which
/// `IdentifierInterner::prepare_for_next_parse` discards the interner contents.
const MAX_RETAINED_IDENTIFIER_INTERNER_BYTES: usize = 16 * 1024;
130
/// Deduplicating store for identifier text.
///
/// Identical identifiers share a single `Arc<str>` allocation; see `intern`.
#[derive(Debug, Default)]
pub(crate) struct IdentifierInterner {
    /// The set of distinct identifier strings seen so far.
    values: HashSet<Arc<str>>,
}
136
137impl IdentifierInterner {
138 fn intern(&mut self, value: &str) -> Arc<str> {
139 if let Some(existing) = self.values.get(value) {
140 return Arc::clone(existing);
141 }
142
143 let interned: Arc<str> = Arc::from(value);
144 let inserted = Arc::clone(&interned);
145 self.values.insert(interned);
146 inserted
147 }
148
149 pub(crate) fn reset(&mut self) {
150 self.values = HashSet::new();
151 }
152
153 pub(crate) fn retained_bytes(&self) -> usize {
154 let interned_value_bytes = self
155 .values
156 .iter()
157 .fold(0usize, |sum, value| sum.saturating_add(value.len()));
158 self.values
159 .capacity()
160 .saturating_mul(std::mem::size_of::<Arc<str>>())
161 .saturating_add(interned_value_bytes)
162 }
163
164 pub(crate) fn prepare_for_next_parse(&mut self) {
165 if self.values.len() > MAX_RETAINED_IDENTIFIER_INTERNER_ENTRIES
166 || self.retained_bytes() > MAX_RETAINED_IDENTIFIER_INTERNER_BYTES
167 {
168 self.reset();
169 }
170 }
171
172 #[cfg(test)]
173 pub(crate) fn is_empty(&self) -> bool {
174 self.values.is_empty()
175 }
176
177 #[cfg(test)]
178 pub(crate) fn len(&self) -> usize {
179 self.values.len()
180 }
181}
182
/// Byte-oriented SQL lexer over a borrowed source string.
pub struct Lexer<'a> {
    /// The source text as raw bytes.
    src: &'a [u8],
    /// Byte offset of the next unread byte in `src`.
    pos: usize,
    /// Line of the next unread byte; starts at 1.
    line: u32,
    /// Column of the next unread byte; starts at 1 and counts bytes.
    col: u32,
    /// Whether per-byte TRACE events are emitted from `advance`; sampled once
    /// at construction.
    trace_chars: bool,
    /// Interner used for identifier and quoted-identifier text.
    interner: IdentifierInterner,
}
198
199impl<'a> Lexer<'a> {
    /// Emits a DEBUG-level `tracing` event on target `fsqlite.parse` describing
    /// `token` (kind, span bounds, line and column).
    fn log_token(token: &Token) {
        tracing::debug!(
            target: "fsqlite.parse",
            token = ?token.kind,
            start = token.span.start,
            end = token.span.end,
            line = token.line,
            col = token.col,
            "tokenized token"
        );
    }
211
212 #[must_use]
214 pub fn new(source: &'a str) -> Self {
215 Self {
216 src: source.as_bytes(),
217 pos: 0,
218 line: 1,
219 col: 1,
220 trace_chars: tracing::enabled!(target: "fsqlite.parse", Level::TRACE),
221 interner: IdentifierInterner::default(),
222 }
223 }
224
    /// Tokenizes all of `source` into a freshly allocated vector.
    ///
    /// Thin convenience wrapper around `tokenize_into`; the returned vector
    /// ends with a `TokenKind::Eof` token.
    #[must_use]
    pub fn tokenize(source: &'a str) -> Vec<Token> {
        let mut tokens = Vec::new();
        Self::tokenize_into(source, &mut tokens);
        tokens
    }
232
233 fn new_with_interner(source: &'a str, interner: IdentifierInterner) -> Self {
234 Self {
235 src: source.as_bytes(),
236 pos: 0,
237 line: 1,
238 col: 1,
239 trace_chars: tracing::enabled!(target: "fsqlite.parse", Level::TRACE),
240 interner,
241 }
242 }
243
    /// Tokenizes all of `source` into the caller-provided `tokens` buffer.
    ///
    /// Uses a throwaway identifier interner and delegates to
    /// `tokenize_into_with_interner`, which clears `tokens` before filling it.
    pub fn tokenize_into(source: &'a str, tokens: &mut Vec<Token>) {
        let mut interner = IdentifierInterner::default();
        Self::tokenize_into_with_interner(source, tokens, &mut interner);
    }
253
254 pub(crate) fn tokenize_into_with_interner(
255 source: &'a str,
256 tokens: &mut Vec<Token>,
257 interner: &mut IdentifierInterner,
258 ) {
259 let input_bytes = source.len();
260 let collect_tokenize_metrics = tokenize_metrics_enabled();
261 let trace_tokenize = tracing::enabled!(target: "fsqlite.parse", Level::TRACE);
262 let span = trace_tokenize.then(|| {
263 tracing::span!(
264 target: "fsqlite.parse",
265 Level::TRACE,
266 "tokenize",
267 token_count = tracing::field::Empty,
268 input_bytes,
269 elapsed_us = tracing::field::Empty,
270 )
271 });
272 let _guard = span.as_ref().map(|span| span.enter());
273 let started = (collect_tokenize_metrics || trace_tokenize).then(Instant::now);
274
275 let mut lexer = Self::new_with_interner(source, std::mem::take(interner));
276 let target_capacity = input_bytes / 4 + 1;
277 tokens.clear();
278 if target_capacity > tokens.capacity() {
279 tokens.reserve(target_capacity - tokens.capacity());
280 }
281 loop {
282 let tok = lexer.next_token();
283 let is_eof = tok.kind == TokenKind::Eof;
284 tokens.push(tok);
285 if is_eof {
286 break;
287 }
288 }
289
290 *interner = lexer.interner;
291
292 if let Some(started) = started {
293 let elapsed_us = saturating_u64_from_u128(started.elapsed().as_micros());
294 if let Some(span) = span.as_ref() {
295 span.record("token_count", saturating_u64_from_usize(tokens.len()));
296 span.record("elapsed_us", elapsed_us);
297 }
298 if collect_tokenize_metrics {
299 record_tokenize_metrics(tokens.len(), elapsed_us);
300 }
301 }
302 }
303
    /// Returns a snapshot of the global tokenizer counters.
    ///
    /// Associated-function alias for `tokenize_metrics_snapshot`.
    #[must_use]
    pub fn metrics_snapshot() -> TokenizeMetricsSnapshot {
        tokenize_metrics_snapshot()
    }
309
    /// Resets the global tokenizer counters to zero.
    ///
    /// Associated-function alias for `reset_tokenize_metrics`.
    pub fn reset_metrics() {
        reset_tokenize_metrics();
    }
314
    /// Scans and returns the next token.
    ///
    /// Leading whitespace and comments are skipped first. At end of input a
    /// zero-width `TokenKind::Eof` token is returned (and returned again on
    /// every later call). Lexical problems are reported in-band as
    /// `TokenKind::Error` tokens rather than by panicking or returning `Result`.
    /// Every returned token is also passed to `log_token`.
    pub fn next_token(&mut self) -> Token {
        self.skip_whitespace_and_comments();

        if self.pos >= self.src.len() {
            let token = self.make_token(TokenKind::Eof, self.pos, self.pos);
            Self::log_token(&token);
            return token;
        }

        // Remember where the token starts; the sub-lexers move `pos`/`line`/`col`.
        let start = self.pos;
        let start_line = self.line;
        let start_col = self.col;
        let ch = self.src[self.pos];

        // Dispatch on the first byte. Arm order matters where guards are used.
        let kind = match ch {
            b'\'' => self.lex_string(),

            b'"' => self.lex_double_quoted_id(),

            b'`' => self.lex_backtick_id(),

            b'[' => self.lex_bracket_id(),

            // Must precede the identifier arm so `X'..'` is a blob, not an id.
            b'X' | b'x' if self.peek_at(1) == Some(b'\'') => self.lex_blob(),

            b'0'..=b'9' => self.lex_number(),
            // `.5`-style literal; a lone `.` falls through to the `Dot` arm below.
            b'.' if self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) => self.lex_number(),

            // Any byte >= 0x80 (non-ASCII UTF-8) is treated as an identifier byte.
            b'a'..=b'z' | b'A'..=b'Z' | b'_' | 0x80..=0xFF => self.lex_identifier(),

            b'?' => self.lex_question(),
            b':' => self.lex_colon_param(),
            b'@' => self.lex_at_param(),
            b'$' => self.lex_dollar_param(),

            // Single-byte punctuation and operators.
            b'+' => {
                self.advance();
                TokenKind::Plus
            }
            b'*' => {
                self.advance();
                TokenKind::Star
            }
            b'/' => {
                self.advance();
                TokenKind::Slash
            }
            b'%' => {
                self.advance();
                TokenKind::Percent
            }
            b'&' => {
                self.advance();
                TokenKind::Ampersand
            }
            b'~' => {
                self.advance();
                TokenKind::Tilde
            }
            b',' => {
                self.advance();
                TokenKind::Comma
            }
            b';' => {
                self.advance();
                TokenKind::Semicolon
            }
            b'(' => {
                self.advance();
                TokenKind::LeftParen
            }
            b')' => {
                self.advance();
                TokenKind::RightParen
            }
            b'.' => {
                self.advance();
                TokenKind::Dot
            }

            // Operators that may extend to two or three bytes.
            b'-' => self.lex_minus_or_arrow(),
            b'<' => self.lex_lt(),
            b'>' => self.lex_gt(),
            b'=' => self.lex_eq(),
            b'!' => self.lex_bang(),
            b'|' => self.lex_pipe(),

            // Anything else: consume one byte and report it.
            _ => {
                self.advance();
                let s = String::from_utf8_lossy(&self.src[start..self.pos]).into_owned();
                TokenKind::Error(format!("unexpected character: {s}"))
            }
        };

        let token = Token {
            kind,
            // Byte offsets are narrowed to u32 for the span.
            #[allow(clippy::cast_possible_truncation)]
            span: Span::new(start as u32, self.pos as u32),
            line: start_line,
            col: start_col,
        };

        Self::log_token(&token);
        token
    }
431
432 #[allow(clippy::cast_possible_truncation)]
437 fn advance_by(&mut self, n: usize) {
438 if n == 0 {
439 return;
440 }
441 let end = self.pos + n;
442 let slice = &self.src[self.pos..end];
443 #[allow(clippy::naive_bytecount)]
444 let newlines = slice.iter().filter(|&&b| b == b'\n').count();
445 if newlines > 0 {
446 self.line += newlines as u32;
447 let last_nl = slice.iter().rposition(|&b| b == b'\n').unwrap_or(0);
448 self.col = (n - last_nl) as u32;
449 } else {
450 self.col += n as u32;
451 }
452 self.pos = end;
453 }
454
455 fn advance(&mut self) -> u8 {
456 let pos = self.pos;
457 let line = self.line;
458 let col = self.col;
459 let ch = self.src[self.pos];
460 self.pos += 1;
461 if ch == b'\n' {
462 self.line += 1;
463 self.col = 1;
464 } else {
465 self.col += 1;
466 }
467 if self.trace_chars {
468 tracing::trace!(
469 target: "fsqlite.parse",
470 byte = ch,
471 pos,
472 line,
473 col,
474 "tokenize char"
475 );
476 }
477 ch
478 }
479
480 fn peek(&self) -> Option<u8> {
481 self.src.get(self.pos).copied()
482 }
483
    /// Returns the byte `offset` positions past the current one without
    /// consuming anything, or `None` if that position is past the end of input.
    fn peek_at(&self, offset: usize) -> Option<u8> {
        self.src.get(self.pos + offset).copied()
    }
487
488 #[allow(clippy::cast_possible_truncation)]
489 fn make_token(&self, kind: TokenKind, start: usize, end: usize) -> Token {
490 Token {
491 kind,
492 span: Span::new(start as u32, end as u32),
493 line: self.line,
494 col: self.col,
495 }
496 }
497
498 fn skip_whitespace_and_comments(&mut self) {
500 loop {
501 let mut ws_len = 0;
503 while self.pos + ws_len < self.src.len()
504 && self.src[self.pos + ws_len].is_ascii_whitespace()
505 {
506 ws_len += 1;
507 }
508 if ws_len > 0 {
509 self.advance_by(ws_len);
510 }
511
512 if self.pos >= self.src.len() {
513 break;
514 }
515
516 if self.src[self.pos] == b'-' && self.peek_at(1) == Some(b'-') {
518 self.advance(); self.advance(); while self.pos < self.src.len() && self.src[self.pos] != b'\n' {
521 self.advance();
522 }
523 continue;
524 }
525
526 if self.src[self.pos] == b'/' && self.peek_at(1) == Some(b'*') {
528 self.advance(); self.advance(); let closed = loop {
531 if self.pos >= self.src.len() {
532 break false;
533 }
534 if self.src[self.pos] == b'*' && self.peek_at(1) == Some(b'/') {
535 self.advance();
536 self.advance();
537 break true;
538 }
539 self.advance();
540 };
541 if !closed {
542 self.pos = self.src.len();
544 }
545 continue;
546 }
547
548 break;
549 }
550 }
551
552 fn lex_string(&mut self) -> TokenKind {
557 let start = self.pos;
558 self.advance(); let mut value = String::new();
561 loop {
562 let remaining = &self.src[self.pos..];
564 if let Some(offset) = memchr(b'\'', remaining) {
565 value.push_str(&String::from_utf8_lossy(
567 &self.src[self.pos..self.pos + offset],
568 ));
569 self.advance_by(offset);
571 self.advance(); if self.peek() == Some(b'\'') {
575 value.push('\'');
576 self.advance();
577 } else {
578 return TokenKind::String(value);
579 }
580 } else {
581 self.pos = self.src.len();
583 return TokenKind::Error(format!(
584 "unterminated string literal starting at byte {}",
585 start
586 ));
587 }
588 }
589 }
590
591 fn lex_double_quoted_id(&mut self) -> TokenKind {
593 let start = self.pos;
594 self.advance(); let mut value = String::new();
597 loop {
598 let remaining = &self.src[self.pos..];
599 if let Some(offset) = memchr(b'"', remaining) {
600 value.push_str(&String::from_utf8_lossy(
601 &self.src[self.pos..self.pos + offset],
602 ));
603 self.advance_by(offset);
604 self.advance(); if self.peek() == Some(b'"') {
608 value.push('"');
609 self.advance();
610 } else {
611 return TokenKind::QuotedId(self.interner.intern(&value), true);
612 }
613 } else {
614 self.pos = self.src.len();
615 return TokenKind::Error(format!(
616 "unterminated double-quoted identifier at byte {}",
617 start
618 ));
619 }
620 }
621 }
622
623 fn lex_backtick_id(&mut self) -> TokenKind {
625 let start = self.pos;
626 self.advance(); let mut value = String::new();
629 loop {
630 let remaining = &self.src[self.pos..];
631 if let Some(offset) = memchr(b'`', remaining) {
632 value.push_str(&String::from_utf8_lossy(
633 &self.src[self.pos..self.pos + offset],
634 ));
635 self.advance_by(offset);
636 self.advance(); if self.peek() == Some(b'`') {
639 value.push('`');
640 self.advance();
641 } else {
642 return TokenKind::QuotedId(self.interner.intern(&value), false);
643 }
644 } else {
645 self.pos = self.src.len();
646 return TokenKind::Error(format!(
647 "unterminated backtick identifier at byte {}",
648 start
649 ));
650 }
651 }
652 }
653
654 fn lex_bracket_id(&mut self) -> TokenKind {
656 let start = self.pos;
657 self.advance(); let mut value = String::new();
660 let remaining = &self.src[self.pos..];
661 if let Some(offset) = memchr(b']', remaining) {
662 value.push_str(&String::from_utf8_lossy(
663 &self.src[self.pos..self.pos + offset],
664 ));
665 self.advance_by(offset);
666 self.advance(); TokenKind::QuotedId(self.interner.intern(&value), false)
668 } else {
669 self.pos = self.src.len();
670 TokenKind::Error(format!("unterminated bracket identifier at byte {}", start))
671 }
672 }
673
674 fn lex_blob(&mut self) -> TokenKind {
676 let start = self.pos;
677 self.advance(); self.advance(); let hex_start = self.pos;
681 let remaining = &self.src[self.pos..];
682 if let Some(offset) = memchr(b'\'', remaining) {
683 let hex_bytes = &self.src[hex_start..hex_start + offset];
684 self.advance_by(offset);
685 self.advance(); if hex_bytes.len() % 2 != 0 {
689 return TokenKind::Error(format!(
690 "blob literal has odd number of hex digits at byte {}",
691 start
692 ));
693 }
694
695 let mut bytes = Vec::with_capacity(hex_bytes.len() / 2);
698 for pair in hex_bytes.chunks_exact(2) {
699 let hi = hex_digit(pair[0]);
700 let lo = hex_digit(pair[1]);
701 match (hi, lo) {
702 (Some(h), Some(l)) => bytes.push((h << 4) | l),
703 _ => {
704 return TokenKind::Error(format!(
705 "invalid hex in blob literal at byte {start}"
706 ));
707 }
708 }
709 }
710 TokenKind::Blob(bytes)
711 } else {
712 self.pos = self.src.len();
713 TokenKind::Error(format!("unterminated blob literal at byte {}", start))
714 }
715 }
716
    /// Lexes a numeric literal starting at a digit, or at a `.` that is followed
    /// by a digit.
    ///
    /// Produces:
    /// - `TokenKind::Integer` for `0x`/`0X` hex literals (the 64-bit pattern is
    ///   reinterpreted as `i64`) and for decimal integers that fit in `i64`;
    /// - `TokenKind::OversizedInt` (carrying the text) for decimal integers that
    ///   do not fit in `i64`;
    /// - `TokenKind::Float` for literals with a fraction, a trailing dot, or an
    ///   exponent; non-finite parse results are replaced by `f64::MAX`;
    /// - `TokenKind::Error` for an empty or over-long hex literal, for a number
    ///   immediately followed by identifier characters, or for an unparsable
    ///   float.
    fn lex_number(&mut self) -> TokenKind {
        let start = self.pos;

        // Hex literal: `0x` / `0X` followed by at least one hex digit.
        if self.src[self.pos] == b'0' && self.peek_at(1).is_some_and(|c| c == b'x' || c == b'X') {
            self.advance();
            self.advance();
            let hex_start = self.pos;
            while self.pos < self.src.len() && self.src[self.pos].is_ascii_hexdigit() {
                self.advance();
            }
            if self.pos == hex_start {
                return TokenKind::Error("empty hex literal".to_owned());
            }
            let hex_str = String::from_utf8_lossy(&self.src[hex_start..self.pos]);
            // Leading zeros do not count towards the 16-digit (64-bit) limit.
            let significant = hex_str.trim_start_matches('0');
            if significant.len() > 16 {
                return TokenKind::Error(format!("hex literal out of range at byte {start}"));
            }
            let parse_str = if significant.is_empty() {
                "0"
            } else {
                significant
            };
            // NOTE(review): unlike the decimal path below, nothing here rejects
            // identifier characters directly after the digits (e.g. `0x1G`);
            // confirm whether that is intended.
            return match u64::from_str_radix(parse_str, 16) {
                Ok(v) => {
                    // Bit-for-bit reinterpretation: 0xFFFF_FFFF_FFFF_FFFF becomes -1.
                    #[allow(clippy::cast_possible_wrap)]
                    let i = v as i64;
                    TokenKind::Integer(i)
                }
                Err(_) => TokenKind::Error(format!("hex literal out of range at byte {start}")),
            };
        }

        let mut is_float = false;

        // Integer part (empty when the literal starts with `.`).
        while self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
            self.advance();
        }

        // True when the bytes at `offset` look like `e`/`E`, an optional sign,
        // and at least one digit. Takes the lexer explicitly so it does not hold
        // a borrow of `self` across the mutations below.
        let is_valid_exponent = |lexer: &Self, mut offset: usize| -> bool {
            if let Some(c) = lexer.peek_at(offset) {
                if c == b'e' || c == b'E' {
                    offset += 1;
                    if let Some(s) = lexer.peek_at(offset) {
                        if s == b'+' || s == b'-' {
                            offset += 1;
                        }
                    }
                    if let Some(d) = lexer.peek_at(offset) {
                        return d.is_ascii_digit();
                    }
                }
            }
            false
        };

        if self.pos < self.src.len()
            && self.src[self.pos] == b'.'
            && (self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) || is_valid_exponent(self, 1))
        {
            // `.` followed by fraction digits or directly by an exponent (`1.e5`).
            is_float = true;
            self.advance();
            while self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
                self.advance();
            }
        } else if self.pos < self.src.len()
            && self.src[self.pos] == b'.'
            && start < self.pos
            && !self.peek_at(1).is_some_and(|c| c.is_ascii_alphanumeric() || c == b'_')
        {
            // Trailing dot after at least one digit (`1.`), not followed by an
            // identifier character.
            is_float = true;
            self.advance();
        }

        // A literal that began with `.` is always a float.
        if self.src[start] == b'.' {
            is_float = true;
        }

        // Optional exponent; only consumed when it is well-formed.
        if is_valid_exponent(self, 0) {
            is_float = true;
            self.advance();
            if self.pos < self.src.len()
                && (self.src[self.pos] == b'+' || self.src[self.pos] == b'-')
            {
                self.advance();
            }
            while self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
                self.advance();
            }
        }

        // A number glued to identifier characters (`123a`, `123.a`) is one bad
        // token: swallow the whole run and report it.
        if let Some(c) = self.peek() {
            if c.is_ascii_alphabetic()
                || c == b'_'
                || (c == b'.'
                    && self
                        .peek_at(1)
                        .is_some_and(|n| n.is_ascii_alphabetic() || n == b'_'))
            {
                let err_start = start;
                while self.pos < self.src.len() {
                    let ch = self.src[self.pos];
                    if ch.is_ascii_alphanumeric() || ch == b'_' || ch == b'.' {
                        self.advance();
                    } else {
                        break;
                    }
                }
                let err_text = String::from_utf8_lossy(&self.src[err_start..self.pos]);
                return TokenKind::Error(format!("unrecognized token: \"{err_text}\""));
            }
        }

        let text = String::from_utf8_lossy(&self.src[start..self.pos]);
        if is_float {
            // Replace non-finite results with the largest finite value.
            let clamp = |v: f64| -> f64 { if v.is_finite() { v } else { f64::MAX } };
            match text.parse::<f64>() {
                Ok(v) => TokenKind::Float(clamp(v)),
                Err(_) => {
                    // Retry with a leading `0` for text of the form `.e…`.
                    let mut text_fixed = text.clone().into_owned();
                    if text_fixed.starts_with(".e") || text_fixed.starts_with(".E") {
                        text_fixed.insert(0, '0');
                    }
                    match text_fixed.parse::<f64>() {
                        Ok(v) => TokenKind::Float(clamp(v)),
                        Err(_) => TokenKind::Error(format!("invalid float: {text}")),
                    }
                }
            }
        } else {
            match text.parse::<i64>() {
                Ok(v) => TokenKind::Integer(v),
                Err(_) => {
                    // Digits only, so the parse can only fail on range; keep the
                    // text for the caller.
                    TokenKind::OversizedInt(text.into_owned())
                }
            }
        }
    }
874
875 fn lex_identifier(&mut self) -> TokenKind {
877 let start = self.pos;
878 self.advance(); while self.pos < self.src.len() {
881 let ch = self.src[self.pos];
882 if ch.is_ascii_alphanumeric() || ch == b'_' || ch >= 0x80 {
883 self.advance();
884 } else {
885 break;
886 }
887 }
888
889 let ident_bytes = &self.src[start..self.pos];
890
891 if let Some(kw) = TokenKind::lookup_keyword_bytes(ident_bytes) {
893 kw
894 } else {
895 let text = String::from_utf8_lossy(ident_bytes);
896 TokenKind::Id(self.interner.intern(&text))
897 }
898 }
899
900 fn lex_question(&mut self) -> TokenKind {
902 self.advance(); if self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
904 let num_start = self.pos;
905 while self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
906 self.advance();
907 }
908 let text = String::from_utf8_lossy(&self.src[num_start..self.pos]);
909 match text.parse::<u32>() {
910 Ok(n) if (1..=MAX_VARIABLE_NUMBER).contains(&n) => TokenKind::QuestionNum(n),
911 Ok(n) => TokenKind::Error(format!(
912 "variable number must be between ?1 and ?{MAX_VARIABLE_NUMBER}, got ?{n}"
913 )),
914 Err(_) => TokenKind::Error("invalid parameter number".to_owned()),
915 }
916 } else {
917 TokenKind::Question
918 }
919 }
920
    /// Shared implementation for named parameters (`:name`, `@name`, `$name`).
    ///
    /// Consumes the one-byte prefix, then the name. The name may contain ASCII
    /// alphanumerics, `_`, bytes >= 0x80, and `::` pairs, and may end with one
    /// parenthesised suffix `( ... )`, which is included in the name.
    /// `constructor` wraps the name (without the prefix) into the token kind.
    ///
    /// Returns `TokenKind::Error` when the name is empty or when a `(` suffix is
    /// never closed; `prefix` is only used in those messages.
    fn lex_alpha_param(&mut self, prefix: char, constructor: fn(String) -> TokenKind) -> TokenKind {
        self.advance(); // the prefix byte
        let name_start = self.pos;
        while self.pos < self.src.len() {
            let ch = self.src[self.pos];
            if ch.is_ascii_alphanumeric() || ch == b'_' || ch >= 0x80 {
                self.advance();
            } else if ch == b':' && self.peek_at(1) == Some(b':') {
                // `::` separator inside the name; a single `:` ends the name.
                self.advance();
                self.advance();
            } else if ch == b'(' {
                // Parenthesised suffix: take everything up to the first `)`.
                self.advance();
                while self.pos < self.src.len() && self.src[self.pos] != b')' {
                    self.advance();
                }
                if self.pos >= self.src.len() || self.src[self.pos] != b')' {
                    let name = String::from_utf8_lossy(&self.src[name_start..self.pos]);
                    return TokenKind::Error(format!("unrecognized token: \"{prefix}{name}\""));
                }
                self.advance();
                // The suffix always terminates the name.
                break;
            } else {
                break;
            }
        }
        if self.pos == name_start {
            return TokenKind::Error(format!("empty parameter name after '{prefix}'"));
        }
        let name = String::from_utf8_lossy(&self.src[name_start..self.pos]).into_owned();
        constructor(name)
    }
952
    /// Lexes a `:name` parameter into `TokenKind::ColonParam`.
    fn lex_colon_param(&mut self) -> TokenKind {
        self.lex_alpha_param(':', TokenKind::ColonParam)
    }
957
    /// Lexes an `@name` parameter into `TokenKind::AtParam`.
    fn lex_at_param(&mut self) -> TokenKind {
        self.lex_alpha_param('@', TokenKind::AtParam)
    }
962
    /// Lexes a `$name` parameter into `TokenKind::DollarParam`.
    fn lex_dollar_param(&mut self) -> TokenKind {
        self.lex_alpha_param('$', TokenKind::DollarParam)
    }
967
968 fn lex_minus_or_arrow(&mut self) -> TokenKind {
974 self.advance(); if self.peek() == Some(b'>') {
976 self.advance(); if self.peek() == Some(b'>') {
978 self.advance(); TokenKind::DoubleArrow
980 } else {
981 TokenKind::Arrow
982 }
983 } else {
984 TokenKind::Minus
985 }
986 }
987
988 fn lex_lt(&mut self) -> TokenKind {
990 self.advance(); match self.peek() {
992 Some(b'=') => {
993 self.advance();
994 TokenKind::Le
995 }
996 Some(b'>') => {
997 self.advance();
998 TokenKind::LtGt
999 }
1000 Some(b'<') => {
1001 self.advance();
1002 TokenKind::ShiftLeft
1003 }
1004 _ => TokenKind::Lt,
1005 }
1006 }
1007
1008 fn lex_gt(&mut self) -> TokenKind {
1010 self.advance(); match self.peek() {
1012 Some(b'=') => {
1013 self.advance();
1014 TokenKind::Ge
1015 }
1016 Some(b'>') => {
1017 self.advance();
1018 TokenKind::ShiftRight
1019 }
1020 _ => TokenKind::Gt,
1021 }
1022 }
1023
1024 fn lex_eq(&mut self) -> TokenKind {
1026 self.advance(); if self.peek() == Some(b'=') {
1028 self.advance();
1029 TokenKind::EqEq
1030 } else {
1031 TokenKind::Eq
1032 }
1033 }
1034
1035 fn lex_bang(&mut self) -> TokenKind {
1037 self.advance(); if self.peek() == Some(b'=') {
1039 self.advance();
1040 TokenKind::Ne
1041 } else {
1042 TokenKind::Error("unexpected '!', did you mean '!='?".to_owned())
1043 }
1044 }
1045
1046 fn lex_pipe(&mut self) -> TokenKind {
1048 self.advance(); if self.peek() == Some(b'|') {
1050 self.advance();
1051 TokenKind::Concat
1052 } else {
1053 TokenKind::Pipe
1054 }
1055 }
1056}
1057
/// Returns the numeric value (0..=15) of an ASCII hex digit, or `None` if `b`
/// is not one. Both upper- and lower-case letters are accepted.
const fn hex_digit(b: u8) -> Option<u8> {
    // For each range, the byte that maps to zero once subtracted.
    let zero_point = match b {
        b'0'..=b'9' => b'0',
        b'a'..=b'f' => b'a' - 10,
        b'A'..=b'F' => b'A' - 10,
        _ => return None,
    };
    Some(b - zero_point)
}
1068
1069#[cfg(test)]
1070mod tests {
1071 use super::*;
1072
    /// Tokenizes `src` and returns the full tokens (kind, span, line, column).
    fn lex(src: &str) -> Vec<Token> {
        Lexer::tokenize(src)
    }
1076
1077 fn kinds(src: &str) -> Vec<TokenKind> {
1078 lex(src).into_iter().map(|t| t.kind).collect()
1079 }
1080
1081 #[test]
1082 fn test_lex_integer_literals() {
1083 let tokens = kinds("42 0 0xFF");
1084 assert_eq!(
1085 tokens,
1086 vec![
1087 TokenKind::Integer(42),
1088 TokenKind::Integer(0),
1089 TokenKind::Integer(255),
1090 TokenKind::Eof,
1091 ]
1092 );
1093 }
1094
1095 #[test]
1096 fn test_tokenize_into_reuses_caller_owned_capacity() {
1097 let mut scratch = Vec::new();
1098 Lexer::tokenize_into(
1099 "SELECT 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz';",
1100 &mut scratch,
1101 );
1102 let warmed_capacity = scratch.capacity();
1103 assert!(
1104 warmed_capacity > 0,
1105 "warm parse should allocate token scratch"
1106 );
1107
1108 Lexer::tokenize_into("SELECT 1;", &mut scratch);
1109 assert_eq!(
1110 scratch.capacity(),
1111 warmed_capacity,
1112 "smaller follow-up parse should reuse the warmed token buffer",
1113 );
1114 assert_eq!(
1115 scratch.last().map(|token| &token.kind),
1116 Some(&TokenKind::Eof),
1117 "tokenize_into should still terminate with EOF in reused scratch",
1118 );
1119 }
1120
1121 #[test]
1122 fn test_lex_float_literals() {
1123 let tokens = kinds("3.14 1e10 .5 1.0e-3 0.0");
1124 let expected = 3.0 + 0.14;
1127 assert!(matches!(
1128 tokens[0],
1129 TokenKind::Float(v) if (v - expected).abs() < 1e-10
1130 ));
1131 assert!(matches!(tokens[1], TokenKind::Float(v) if (v - 1e10).abs() < 1.0));
1132 assert!(matches!(tokens[2], TokenKind::Float(v) if (v - 0.5).abs() < 1e-10));
1133 assert!(matches!(tokens[3], TokenKind::Float(v) if (v - 0.001).abs() < 1e-10));
1134 assert!(matches!(tokens[4], TokenKind::Float(v) if v.abs() < 1e-10));
1135 assert_eq!(tokens[5], TokenKind::Eof);
1136 }
1137
1138 #[test]
1139 fn test_lex_string_literals() {
1140 let tokens = kinds("'hello' 'it''s' ''");
1141 assert_eq!(tokens[0], TokenKind::String("hello".to_owned()));
1142 assert_eq!(tokens[1], TokenKind::String("it's".to_owned()));
1143 assert_eq!(tokens[2], TokenKind::String(String::new()));
1144 assert_eq!(tokens[3], TokenKind::Eof);
1145 }
1146
1147 #[test]
1148 fn test_lex_blob_literals() {
1149 let tokens = kinds("X'CAFE' x'00ff' X''");
1150 assert_eq!(tokens[0], TokenKind::Blob(vec![0xCA, 0xFE]));
1151 assert_eq!(tokens[1], TokenKind::Blob(vec![0x00, 0xFF]));
1152 assert_eq!(tokens[2], TokenKind::Blob(vec![]));
1153 assert_eq!(tokens[3], TokenKind::Eof);
1154 }
1155
    /// A blob literal with an odd number of hex digits is reported as an error.
    #[test]
    fn test_lex_blob_odd_hex_error() {
        let tokens = kinds("X'CAF'");
        assert!(matches!(tokens[0], TokenKind::Error(_)));
    }
1161
1162 #[test]
1163 fn test_lex_blob_non_ascii_no_panic() {
1164 let tokens = kinds("X'U\u{05fc} '");
1167 assert!(matches!(tokens[0], TokenKind::Error(_)));
1168
1169 let tokens2 = kinds("X'GG'");
1171 assert!(matches!(tokens2[0], TokenKind::Error(_)));
1172 }
1173
1174 #[test]
1175 fn test_lex_variables() {
1176 let tokens = kinds("?1 :name @param $var ?");
1177 assert_eq!(tokens[0], TokenKind::QuestionNum(1));
1178 assert_eq!(tokens[1], TokenKind::ColonParam("name".to_owned()));
1179 assert_eq!(tokens[2], TokenKind::AtParam("param".to_owned()));
1180 assert_eq!(tokens[3], TokenKind::DollarParam("var".to_owned()));
1181 assert_eq!(tokens[4], TokenKind::Question);
1182 assert_eq!(tokens[5], TokenKind::Eof);
1183 }
1184
1185 #[test]
1186 fn test_lex_quoted_identifiers() {
1187 let tokens = kinds("\"table_name\" [column] `backtick`");
1188 assert_eq!(tokens[0], TokenKind::QuotedId("table_name".into(), true));
1189 assert_eq!(tokens[1], TokenKind::QuotedId("column".into(), false));
1190 assert_eq!(tokens[2], TokenKind::QuotedId("backtick".into(), false));
1191 }
1192
    /// A double-quoted identifier is flagged with `true` in `QuotedId`.
    #[test]
    fn test_lex_dqs_flag() {
        let tokens = kinds("\"hello\"");
        assert_eq!(tokens[0], TokenKind::QuotedId("hello".into(), true));
    }
1199
1200 #[test]
1201 fn test_lex_keywords() {
1202 let tokens = kinds("SELECT FROM WHERE INSERT CREATE TABLE CONCURRENT");
1203 assert_eq!(tokens[0], TokenKind::KwSelect);
1204 assert_eq!(tokens[1], TokenKind::KwFrom);
1205 assert_eq!(tokens[2], TokenKind::KwWhere);
1206 assert_eq!(tokens[3], TokenKind::KwInsert);
1207 assert_eq!(tokens[4], TokenKind::KwCreate);
1208 assert_eq!(tokens[5], TokenKind::KwTable);
1209 assert_eq!(tokens[6], TokenKind::KwConcurrent);
1210
1211 let tokens2 = kinds("select from where");
1213 assert_eq!(tokens2[0], TokenKind::KwSelect);
1214 assert_eq!(tokens2[1], TokenKind::KwFrom);
1215 assert_eq!(tokens2[2], TokenKind::KwWhere);
1216 }
1217
1218 #[test]
1219 fn test_lex_operators() {
1220 let tokens = kinds("+ - * / % & | ~ << >> = < <= > >= == != <> || -> ->>");
1221 let expected = vec![
1222 TokenKind::Plus,
1223 TokenKind::Minus,
1224 TokenKind::Star,
1225 TokenKind::Slash,
1226 TokenKind::Percent,
1227 TokenKind::Ampersand,
1228 TokenKind::Pipe,
1229 TokenKind::Tilde,
1230 TokenKind::ShiftLeft,
1231 TokenKind::ShiftRight,
1232 TokenKind::Eq,
1233 TokenKind::Lt,
1234 TokenKind::Le,
1235 TokenKind::Gt,
1236 TokenKind::Ge,
1237 TokenKind::EqEq,
1238 TokenKind::Ne,
1239 TokenKind::LtGt,
1240 TokenKind::Concat,
1241 TokenKind::Arrow,
1242 TokenKind::DoubleArrow,
1243 TokenKind::Eof,
1244 ];
1245 assert_eq!(tokens, expected);
1246 }
1247
1248 #[test]
1249 fn test_lex_eq_vs_eqeq() {
1250 let tokens = kinds("= ==");
1251 assert_eq!(tokens[0], TokenKind::Eq);
1252 assert_eq!(tokens[1], TokenKind::EqEq);
1253 }
1254
1255 #[test]
1256 fn test_lex_ne_vs_ltgt() {
1257 let tokens = kinds("!= <>");
1258 assert_eq!(tokens[0], TokenKind::Ne);
1259 assert_eq!(tokens[1], TokenKind::LtGt);
1260 }
1261
    /// A string literal without a closing quote is reported as an error token.
    #[test]
    fn test_lex_error_unterminated_string() {
        let tokens = kinds("'hello");
        assert!(matches!(tokens[0], TokenKind::Error(_)));
    }
1267
1268 #[test]
1269 fn test_lex_line_column_tracking() {
1270 let tokens = lex("SELECT\n a,\n b");
1271 assert_eq!(tokens[0].line, 1);
1272 assert_eq!(tokens[0].col, 1);
1273 assert_eq!(tokens[1].line, 2);
1275 assert_eq!(tokens[1].col, 3);
1276 assert_eq!(tokens[2].line, 2);
1278 assert_eq!(tokens[2].col, 4);
1279 assert_eq!(tokens[3].line, 3);
1281 assert_eq!(tokens[3].col, 3);
1282 }
1283
/// Whitespace, a `--` line comment and a `/* ... */` block comment must not show up in
/// the token stream; only the surrounding keywords and identifiers remain.
#[test]
fn test_lex_whitespace_and_comments_skipped() {
    let lexed = kinds("SELECT -- this is a comment\n a /* block */ FROM b");
    let expected = [
        TokenKind::KwSelect,
        TokenKind::Id("a".into()),
        TokenKind::KwFrom,
        TokenKind::Id("b".into()),
        TokenKind::Eof,
    ];
    for (idx, want) in expected.iter().enumerate() {
        assert_eq!(&lexed[idx], want);
    }
}
1293
/// Hex literals at the edges of the 64-bit range must lex to the `i64` values asserted
/// below: all bits set yields `-1`, only the top bit set yields `i64::MIN`, and all bits
/// but the top one yield `i64::MAX`.
#[test]
fn test_lex_hex_large_values() {
    let cases = [
        ("0xFFFFFFFFFFFFFFFF", -1),
        ("0x8000000000000000", i64::MIN),
        ("0x7FFFFFFFFFFFFFFF", i64::MAX),
    ];
    // Each literal is lexed on its own, exactly one input per call.
    for (literal, value) in cases {
        assert_eq!(kinds(literal)[0], TokenKind::Integer(value));
    }
}
1309
/// A hex literal one digit too wide for 64 bits must be rejected with an error whose
/// message mentions "out of range".
#[test]
fn test_lex_hex_overflow_17_digits_rejects() {
    let lexed = kinds("0x10000000000000000");
    let rejected = match &lexed[0] {
        TokenKind::Error(msg) => msg.contains("out of range"),
        _ => false,
    };
    assert!(
        rejected,
        "expected error for 17-digit hex, got {:?}",
        lexed[0]
    );
}
1321
/// Leading zeros in a hex literal must not trigger the out-of-range rejection: the padded
/// literal below still lexes to the integer 1.
#[test]
fn test_lex_hex_leading_zeros_accepted() {
    let padded_one = "0x00000000000000001";
    assert_eq!(kinds(padded_one)[0], TokenKind::Integer(1));
}
1329
/// Hex literals are accepted with either a `0x` or a `0X` prefix and with upper- or
/// lower-case digits.
#[test]
fn test_lex_number_hex() {
    let lexed = kinds("0x1A 0Xff 0x0");
    let expected = [
        TokenKind::Integer(26),
        TokenKind::Integer(255),
        TokenKind::Integer(0),
        TokenKind::Eof,
    ];
    for (idx, want) in expected.iter().enumerate() {
        assert_eq!(&lexed[idx], want);
    }
}
1338
/// A number immediately followed by identifier characters must become a single error
/// token whose message quotes the whole offending text.
#[test]
fn test_lex_number_unrecognized() {
    let lexed = kinds("123a 123.a");
    // True only for an error token whose message contains `needle`.
    let is_error_with = |kind: &TokenKind, needle: &str| match kind {
        TokenKind::Error(msg) => msg.contains(needle),
        _ => false,
    };
    assert!(is_error_with(&lexed[0], "unrecognized token: \"123a\""));
    assert!(is_error_with(&lexed[1], "unrecognized token: \"123.a\""));
}
1349
/// A bare `0x` prefix with no hex digits after it must produce an error token.
#[test]
fn test_lex_number_hex_invalid() {
    let lexed = kinds("0x");
    let is_error = matches!(&lexed[0], TokenKind::Error(_));
    assert!(is_error);
}
1355
/// A bare `?` lexes to `Question`, while `?` followed by digits lexes to `QuestionNum`
/// carrying the parsed number.
#[test]
fn test_lex_positional_params() {
    let lexed = kinds("? ?123");
    assert_eq!(
        lexed[..3],
        [
            TokenKind::Question,
            TokenKind::QuestionNum(123),
            TokenKind::Eof,
        ]
    );
}
1363
/// Numbered parameters outside the accepted range must be rejected: both `?0` and
/// `?32767` have to yield an error whose message names the `?1`..`?32766` bounds.
#[test]
fn test_lex_positional_params_reject_zero_and_out_of_range() {
    let lexed = kinds("?0 ?32767");
    // True only for an error token that reports the allowed parameter range.
    let reports_range = |kind: &TokenKind| match kind {
        TokenKind::Error(msg) => msg.contains("between ?1 and ?32766"),
        _ => false,
    };

    assert!(
        reports_range(&lexed[0]),
        "expected ?0 to be rejected, got {:?}",
        lexed[0]
    );
    assert!(
        reports_range(&lexed[1]),
        "expected ?32767 to be rejected, got {:?}",
        lexed[1]
    );
    // Each rejected parameter is a single token, so `Eof` follows directly.
    assert_eq!(lexed[2], TokenKind::Eof);
}
1379
/// Named parameters introduced by `:`, `@` and `$` each get their own token kind, and the
/// stored name excludes the introducing character.
#[test]
fn test_lex_named_params() {
    let lexed = kinds(":foo @bar $baz_123");
    let expected = [
        TokenKind::ColonParam(String::from("foo")),
        TokenKind::AtParam(String::from("bar")),
        TokenKind::DollarParam(String::from("baz_123")),
        TokenKind::Eof,
    ];
    for (idx, want) in expected.iter().enumerate() {
        assert_eq!(&lexed[idx], want);
    }
}
1388
/// Parameter names may contain `::` separators and a parenthesised suffix; the whole
/// spelling after the introducing character is kept as the parameter name.
#[test]
fn test_lex_named_params_with_tcl_syntax() {
    let lexed = kinds("$::foo(bar) :a::b");
    let expected = [
        TokenKind::DollarParam(String::from("::foo(bar)")),
        TokenKind::ColonParam(String::from("a::b")),
        TokenKind::Eof,
    ];
    for (idx, want) in expected.iter().enumerate() {
        assert_eq!(&lexed[idx], want);
    }
}
1396
/// A parameter whose parenthesised suffix is never closed must be rejected as one
/// "unrecognized token" error, followed directly by `Eof`.
#[test]
fn test_lex_named_params_with_unclosed_tcl_array_syntax() {
    let lexed = kinds("$::foo(bar");
    let rejected = match &lexed[0] {
        TokenKind::Error(msg) => msg.contains("unrecognized token"),
        _ => false,
    };
    assert!(
        rejected,
        "expected unterminated Tcl-style parameter to be rejected, got {:?}",
        lexed[0]
    );
    assert_eq!(lexed[1], TokenKind::Eof);
}
1407
1408 fn histogram_total(hist: &TokenizeDurationSecondsHistogram) -> u64 {
1409 hist.le_100us + hist.le_250us + hist.le_500us + hist.le_1ms + hist.le_5ms + hist.gt_5ms
1410 }
1411
/// With metrics enabled, two `lex` calls must add up in the token counter, bump the
/// duration sample count to 2, and place every sample in exactly one histogram bucket.
///
/// NOTE(review): the counters and the enable flag are process-wide, and the assertions
/// below compare exact values. Confirm that nothing else lexes or resets metrics while
/// this test runs (for example by serialising the metrics tests), otherwise the exact
/// counts can be disturbed.
#[test]
fn test_tokenize_metrics_accumulate_tokens_and_histogram_samples() {
    /// Puts the global metrics state back when the test ends, including when an
    /// assertion fails and the function unwinds.
    struct RestoreMetrics(bool);

    impl Drop for RestoreMetrics {
        fn drop(&mut self) {
            set_tokenize_metrics_enabled(self.0);
            reset_tokenize_metrics();
        }
    }

    // Declared first so it is dropped last, after every other local.
    let _restore = RestoreMetrics(tokenize_metrics_enabled());
    reset_tokenize_metrics();
    set_tokenize_metrics_enabled(true);

    let first = lex("SELECT 1;");
    let second = lex("SELECT 2;");

    // The expected counter value is derived from what the lexer actually returned, so
    // the test does not hard-code how many tokens each statement produces.
    let expected_total_tokens = u64::try_from(first.len() + second.len()).unwrap_or(u64::MAX);
    let snap = tokenize_metrics_snapshot();
    assert_eq!(snap.fsqlite_tokenize_tokens_total, expected_total_tokens);
    assert_eq!(snap.fsqlite_tokenize_duration_seconds_count, 2);
    // Every recorded duration must land in one bucket, so the buckets sum to the count.
    assert_eq!(
        histogram_total(&snap.fsqlite_tokenize_duration_seconds),
        snap.fsqlite_tokenize_duration_seconds_count
    );
}
1433
/// `reset_tokenize_metrics` must zero the token counter, the sample count, the duration
/// sum and every histogram bucket after metrics have been recorded.
///
/// NOTE(review): the counters and the enable flag are process-wide. Confirm that no
/// other test lexes with metrics enabled between the reset and the final snapshot,
/// otherwise the "all zero" assertions can be disturbed.
#[test]
fn test_tokenize_metrics_reset_clears_all_fields() {
    /// Puts the global metrics state back when the test ends, including when an
    /// assertion fails and the function unwinds. Counters are cleared as well so that a
    /// failure before the explicit reset below does not leave recorded samples behind.
    struct RestoreMetrics(bool);

    impl Drop for RestoreMetrics {
        fn drop(&mut self) {
            set_tokenize_metrics_enabled(self.0);
            reset_tokenize_metrics();
        }
    }

    // Declared first so it is dropped last, after every other local.
    let _restore = RestoreMetrics(tokenize_metrics_enabled());
    reset_tokenize_metrics();
    set_tokenize_metrics_enabled(true);
    let _ = lex("SELECT 42;");

    // Precondition: something was actually recorded, so the reset has work to do.
    let before = tokenize_metrics_snapshot();
    assert!(before.fsqlite_tokenize_tokens_total > 0);
    assert!(before.fsqlite_tokenize_duration_seconds_count > 0);

    reset_tokenize_metrics();
    let after = tokenize_metrics_snapshot();
    assert_eq!(after.fsqlite_tokenize_tokens_total, 0);
    assert_eq!(after.fsqlite_tokenize_duration_seconds_count, 0);
    assert_eq!(after.fsqlite_tokenize_duration_seconds_sum_micros, 0);
    assert_eq!(histogram_total(&after.fsqlite_tokenize_duration_seconds), 0);
}
1454
/// With metrics disabled, a `lex` call must leave every metric at zero.
///
/// NOTE(review): the counters and the enable flag are process-wide. Confirm that no
/// other test enables metrics and lexes while this test runs, otherwise the "all zero"
/// assertions can be disturbed.
#[test]
fn test_tokenize_metrics_can_be_disabled_off_hot_path() {
    /// Puts the global metrics state back when the test ends, including when an
    /// assertion fails and the function unwinds.
    struct RestoreMetrics(bool);

    impl Drop for RestoreMetrics {
        fn drop(&mut self) {
            set_tokenize_metrics_enabled(self.0);
            reset_tokenize_metrics();
        }
    }

    // Declared first so it is dropped last, after every other local.
    let _restore = RestoreMetrics(tokenize_metrics_enabled());
    reset_tokenize_metrics();
    set_tokenize_metrics_enabled(false);

    let _ = lex("SELECT 99;");

    let snap = tokenize_metrics_snapshot();
    assert_eq!(snap.fsqlite_tokenize_tokens_total, 0);
    assert_eq!(snap.fsqlite_tokenize_duration_seconds_count, 0);
    assert_eq!(snap.fsqlite_tokenize_duration_seconds_sum_micros, 0);
    assert_eq!(histogram_total(&snap.fsqlite_tokenize_duration_seconds), 0);
}
1472}