use fsqlite_ast::Span;
use fsqlite_types::limits::MAX_VARIABLE_NUMBER;
use memchr::memchr;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Instant;
use tracing::Level;

use crate::token::{Token, TokenKind};
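
/// Per-run latency buckets for `Lexer::tokenize`. Note that despite the
/// Prometheus-style `le_*` names, the buckets here are disjoint rather than
/// cumulative: each run increments exactly one field.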
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct TokenizeDurationSecondsHistogram {
    pub le_100us: u64,
    pub le_250us: u64,
    pub le_500us: u64,
    pub le_1ms: u64,
    pub le_5ms: u64,
    pub gt_5ms: u64,
}
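
/// A point-in-time copy of the global tokenizer metrics; each field mirrors
/// the counter of the same name.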
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct TokenizeMetricsSnapshot {
    pub fsqlite_tokenize_tokens_total: u64,
    pub fsqlite_tokenize_duration_seconds: TokenizeDurationSecondsHistogram,
    pub fsqlite_tokenize_duration_seconds_count: u64,
    pub fsqlite_tokenize_duration_seconds_sum_micros: u64,
}
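
// Process-wide metric counters. `Ordering::Relaxed` is sufficient throughout:
// each counter is an independent statistic, and no cross-counter consistency
// is required.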
static FSQLITE_TOKENIZE_TOKENS_TOTAL: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_100US: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_250US: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_500US: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_1MS: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_5MS: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_GT_5MS: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_COUNT: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_SUM_MICROS: AtomicU64 = AtomicU64::new(0);

fn saturating_u64_from_usize(value: usize) -> u64 {
    u64::try_from(value).unwrap_or(u64::MAX)
}

fn saturating_u64_from_u128(value: u128) -> u64 {
    u64::try_from(value).unwrap_or(u64::MAX)
}
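
/// Records one completed tokenize run: the token count, one histogram
/// sample, and the elapsed time in microseconds.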
fn record_tokenize_metrics(token_count: usize, elapsed_micros: u64) {
    FSQLITE_TOKENIZE_TOKENS_TOTAL
        .fetch_add(saturating_u64_from_usize(token_count), Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_COUNT.fetch_add(1, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_SUM_MICROS.fetch_add(elapsed_micros, Ordering::Relaxed);

    // Each sample lands in exactly one bucket (the buckets are disjoint).
    let bucket = match elapsed_micros {
        0..=100 => &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_100US,
        101..=250 => &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_250US,
        251..=500 => &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_500US,
        501..=1_000 => &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_1MS,
        1_001..=5_000 => &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_5MS,
        _ => &FSQLITE_TOKENIZE_DURATION_SECONDS_GT_5MS,
    };
    bucket.fetch_add(1, Ordering::Relaxed);
}
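
/// Returns a point-in-time copy of the tokenizer metrics. Summing the
/// histogram buckets yields `fsqlite_tokenize_duration_seconds_count`.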
#[must_use]
pub fn tokenize_metrics_snapshot() -> TokenizeMetricsSnapshot {
    TokenizeMetricsSnapshot {
        fsqlite_tokenize_tokens_total: FSQLITE_TOKENIZE_TOKENS_TOTAL.load(Ordering::Relaxed),
        fsqlite_tokenize_duration_seconds: TokenizeDurationSecondsHistogram {
            le_100us: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_100US.load(Ordering::Relaxed),
            le_250us: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_250US.load(Ordering::Relaxed),
            le_500us: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_500US.load(Ordering::Relaxed),
            le_1ms: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_1MS.load(Ordering::Relaxed),
            le_5ms: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_5MS.load(Ordering::Relaxed),
            gt_5ms: FSQLITE_TOKENIZE_DURATION_SECONDS_GT_5MS.load(Ordering::Relaxed),
        },
        fsqlite_tokenize_duration_seconds_count: FSQLITE_TOKENIZE_DURATION_SECONDS_COUNT
            .load(Ordering::Relaxed),
        fsqlite_tokenize_duration_seconds_sum_micros: FSQLITE_TOKENIZE_DURATION_SECONDS_SUM_MICROS
            .load(Ordering::Relaxed),
    }
}
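
/// Resets every tokenizer metric to zero. Intended for tests and benchmarks;
/// runs that race with the reset may be partially counted.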
pub fn reset_tokenize_metrics() {
    FSQLITE_TOKENIZE_TOKENS_TOTAL.store(0, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_LE_100US.store(0, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_LE_250US.store(0, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_LE_500US.store(0, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_LE_1MS.store(0, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_LE_5MS.store(0, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_GT_5MS.store(0, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_COUNT.store(0, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_SUM_MICROS.store(0, Ordering::Relaxed);
}
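
/// A byte-oriented lexer over one SQL source string. Spans are byte offsets;
/// line and column numbers are 1-based.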
pub struct Lexer<'a> {
    /// Raw source bytes.
    src: &'a [u8],
    /// Current byte offset into `src`.
    pos: usize,
    /// 1-based line number of `pos`.
    line: u32,
    /// 1-based column number of `pos`.
    col: u32,
    /// Cached check for per-character TRACE logging.
    trace_chars: bool,
}

impl<'a> Lexer<'a> {
    fn log_token(token: &Token) {
        tracing::debug!(
            target: "fsqlite.parse",
            token = ?token.kind,
            start = token.span.start,
            end = token.span.end,
            line = token.line,
            col = token.col,
            "tokenized token"
        );
    }

    #[must_use]
    pub fn new(source: &'a str) -> Self {
        Self {
            src: source.as_bytes(),
            pos: 0,
            line: 1,
            col: 1,
            trace_chars: tracing::enabled!(target: "fsqlite.parse", Level::TRACE),
        }
    }
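
    /// Tokenizes `source` to completion; the returned vector always ends
    /// with a single `TokenKind::Eof` token. A minimal usage sketch (module
    /// paths assumed, hence `ignore`):
    ///
    /// ```ignore
    /// let tokens = Lexer::tokenize("SELECT 1;");
    /// assert!(matches!(tokens.last().map(|t| &t.kind), Some(TokenKind::Eof)));
    /// ```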
    #[must_use]
    pub fn tokenize(source: &'a str) -> Vec<Token> {
        let input_bytes = source.len();
        let span = tracing::span!(
            target: "fsqlite.parse",
            Level::TRACE,
            "tokenize",
            token_count = tracing::field::Empty,
            input_bytes,
            elapsed_us = tracing::field::Empty,
        );
        let _guard = span.enter();
        let started = Instant::now();

        let mut lexer = Self::new(source);
        // Heuristic preallocation: roughly one token per four input bytes.
        let mut tokens = Vec::with_capacity(input_bytes / 4 + 1);
        loop {
            let tok = lexer.next_token();
            let is_eof = tok.kind == TokenKind::Eof;
            tokens.push(tok);
            if is_eof {
                break;
            }
        }

        let elapsed = started.elapsed();
        let elapsed_us = saturating_u64_from_u128(elapsed.as_micros());
        span.record("token_count", saturating_u64_from_usize(tokens.len()));
        span.record("elapsed_us", elapsed_us);
        record_tokenize_metrics(tokens.len(), elapsed_us);
        tokens
    }

    /// Convenience wrapper for `tokenize_metrics_snapshot`.
    #[must_use]
    pub fn metrics_snapshot() -> TokenizeMetricsSnapshot {
        tokenize_metrics_snapshot()
    }

    /// Convenience wrapper for `reset_tokenize_metrics`.
    pub fn reset_metrics() {
        reset_tokenize_metrics();
    }
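
    /// Skips leading whitespace and comments, then lexes one token. The end
    /// of input yields `TokenKind::Eof`; lexical errors are reported in-band
    /// as `TokenKind::Error`.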
    pub fn next_token(&mut self) -> Token {
        self.skip_whitespace_and_comments();

        if self.pos >= self.src.len() {
            let token = self.make_token(TokenKind::Eof, self.pos, self.pos);
            Self::log_token(&token);
            return token;
        }

        let start = self.pos;
        let start_line = self.line;
        let start_col = self.col;
        let ch = self.src[self.pos];

        let kind = match ch {
            // String and quoted-identifier literals.
            b'\'' => self.lex_string(),

            b'"' => self.lex_double_quoted_id(),

            b'`' => self.lex_backtick_id(),

            b'[' => self.lex_bracket_id(),

            b'X' | b'x' if self.peek_at(1) == Some(b'\'') => self.lex_blob(),

            // Numeric literals, including ".5"-style floats.
            b'0'..=b'9' => self.lex_number(),
            b'.' if self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) => self.lex_number(),

            b'a'..=b'z' | b'A'..=b'Z' | b'_' | 0x80..=0xFF => self.lex_identifier(),

            // Bind parameters.
            b'?' => self.lex_question(),
            b':' => self.lex_colon_param(),
            b'@' => self.lex_at_param(),
            b'$' => self.lex_dollar_param(),

            // Single-character operators and punctuation.
            b'+' => {
                self.advance();
                TokenKind::Plus
            }
            b'*' => {
                self.advance();
                TokenKind::Star
            }
            b'/' => {
                self.advance();
                TokenKind::Slash
            }
            b'%' => {
                self.advance();
                TokenKind::Percent
            }
            b'&' => {
                self.advance();
                TokenKind::Ampersand
            }
            b'~' => {
                self.advance();
                TokenKind::Tilde
            }
            b',' => {
                self.advance();
                TokenKind::Comma
            }
            b';' => {
                self.advance();
                TokenKind::Semicolon
            }
            b'(' => {
                self.advance();
                TokenKind::LeftParen
            }
            b')' => {
                self.advance();
                TokenKind::RightParen
            }
            b'.' => {
                self.advance();
                TokenKind::Dot
            }

            // Multi-character operators.
            b'-' => self.lex_minus_or_arrow(),
            b'<' => self.lex_lt(),
            b'>' => self.lex_gt(),
            b'=' => self.lex_eq(),
            b'!' => self.lex_bang(),
            b'|' => self.lex_pipe(),

            _ => {
                self.advance();
                let s = String::from_utf8_lossy(&self.src[start..self.pos]).into_owned();
                TokenKind::Error(format!("unexpected character: {s}"))
            }
        };

        let token = Token {
            kind,
            #[allow(clippy::cast_possible_truncation)]
            span: Span::new(start as u32, self.pos as u32),
            line: start_line,
            col: start_col,
        };

        Self::log_token(&token);
        token
    }

    /// Advances `n` bytes, updating line/column bookkeeping in bulk.
    #[allow(clippy::cast_possible_truncation)]
    fn advance_by(&mut self, n: usize) {
        if n == 0 {
            return;
        }
        let end = self.pos + n;
        let slice = &self.src[self.pos..end];
        #[allow(clippy::naive_bytecount)]
        let newlines = slice.iter().filter(|&&b| b == b'\n').count();
        if newlines > 0 {
            self.line += newlines as u32;
            // The column restarts after the last newline in the consumed slice.
            let last_nl = slice.iter().rposition(|&b| b == b'\n').unwrap_or(0);
            self.col = (n - last_nl) as u32;
        } else {
            self.col += n as u32;
        }
        self.pos = end;
    }

    fn advance(&mut self) -> u8 {
        let pos = self.pos;
        let line = self.line;
        let col = self.col;
        let ch = self.src[self.pos];
        self.pos += 1;
        if ch == b'\n' {
            self.line += 1;
            self.col = 1;
        } else {
            self.col += 1;
        }
        if self.trace_chars {
            tracing::trace!(
                target: "fsqlite.parse",
                byte = ch,
                pos,
                line,
                col,
                "tokenize char"
            );
        }
        ch
    }

    fn peek(&self) -> Option<u8> {
        self.src.get(self.pos).copied()
    }

    fn peek_at(&self, offset: usize) -> Option<u8> {
        self.src.get(self.pos + offset).copied()
    }

    #[allow(clippy::cast_possible_truncation)]
    fn make_token(&self, kind: TokenKind, start: usize, end: usize) -> Token {
        // Only used for EOF, where the current line/column is the token position.
        Token {
            kind,
            span: Span::new(start as u32, end as u32),
            line: self.line,
            col: self.col,
        }
    }

    fn skip_whitespace_and_comments(&mut self) {
        loop {
            // Consume a run of ASCII whitespace in one step.
            let mut ws_len = 0;
            while self.pos + ws_len < self.src.len()
                && self.src[self.pos + ws_len].is_ascii_whitespace()
            {
                ws_len += 1;
            }
            if ws_len > 0 {
                self.advance_by(ws_len);
            }

            if self.pos >= self.src.len() {
                break;
            }

            // Line comment: `--` runs to the end of the line.
            if self.src[self.pos] == b'-' && self.peek_at(1) == Some(b'-') {
                self.advance(); // consume '-'
                self.advance(); // consume '-'
                while self.pos < self.src.len() && self.src[self.pos] != b'\n' {
                    self.advance();
                }
                continue;
            }

            // Block comment: `/* ... */`; an unterminated comment consumes
            // the rest of the input.
            if self.src[self.pos] == b'/' && self.peek_at(1) == Some(b'*') {
                self.advance(); // consume '/'
                self.advance(); // consume '*'
                let closed = loop {
                    if self.pos >= self.src.len() {
                        break false;
                    }
                    if self.src[self.pos] == b'*' && self.peek_at(1) == Some(b'/') {
                        self.advance();
                        self.advance();
                        break true;
                    }
                    self.advance();
                };
                if !closed {
                    self.pos = self.src.len();
                }
                continue;
            }

            break;
        }
    }

    /// Lexes a single-quoted string literal; `''` escapes a literal quote.
    fn lex_string(&mut self) -> TokenKind {
        let start = self.pos;
        self.advance(); // consume the opening quote
        let mut value = String::new();
        loop {
            let remaining = &self.src[self.pos..];
            if let Some(offset) = memchr(b'\'', remaining) {
                value.push_str(&String::from_utf8_lossy(
                    &self.src[self.pos..self.pos + offset],
                ));
                self.advance_by(offset);
                self.advance(); // consume the closing quote

                // A doubled quote is an escaped quote; keep scanning.
                if self.peek() == Some(b'\'') {
                    value.push('\'');
                    self.advance();
                } else {
                    return TokenKind::String(value);
                }
            } else {
                self.pos = self.src.len();
                return TokenKind::Error(format!(
                    "unterminated string literal starting at byte {start}"
                ));
            }
        }
    }

    /// Lexes a double-quoted identifier; `""` escapes a literal quote.
    fn lex_double_quoted_id(&mut self) -> TokenKind {
        let start = self.pos;
        self.advance(); // consume the opening quote
        let mut value = String::new();
        loop {
            let remaining = &self.src[self.pos..];
            if let Some(offset) = memchr(b'"', remaining) {
                value.push_str(&String::from_utf8_lossy(
                    &self.src[self.pos..self.pos + offset],
                ));
                self.advance_by(offset);
                self.advance(); // consume the closing quote

                if self.peek() == Some(b'"') {
                    value.push('"');
                    self.advance();
                } else {
                    // The `true` flag marks this identifier as double-quoted.
                    return TokenKind::QuotedId(value, true);
                }
            } else {
                self.pos = self.src.len();
                return TokenKind::Error(format!(
                    "unterminated double-quoted identifier at byte {start}"
                ));
            }
        }
    }

    /// Lexes a backtick-quoted identifier; a doubled backtick escapes.
    fn lex_backtick_id(&mut self) -> TokenKind {
        let start = self.pos;
        self.advance(); // consume the opening backtick
        let mut value = String::new();
        loop {
            let remaining = &self.src[self.pos..];
            if let Some(offset) = memchr(b'`', remaining) {
                value.push_str(&String::from_utf8_lossy(
                    &self.src[self.pos..self.pos + offset],
                ));
                self.advance_by(offset);
                self.advance(); // consume the closing backtick
                if self.peek() == Some(b'`') {
                    value.push('`');
                    self.advance();
                } else {
                    return TokenKind::QuotedId(value, false);
                }
            } else {
                self.pos = self.src.len();
                return TokenKind::Error(format!(
                    "unterminated backtick identifier at byte {start}"
                ));
            }
        }
    }

    /// Lexes a bracket-quoted identifier; there is no escape sequence.
    fn lex_bracket_id(&mut self) -> TokenKind {
        let start = self.pos;
        self.advance(); // consume '['
        let mut value = String::new();
        let remaining = &self.src[self.pos..];
        if let Some(offset) = memchr(b']', remaining) {
            value.push_str(&String::from_utf8_lossy(
                &self.src[self.pos..self.pos + offset],
            ));
            self.advance_by(offset);
            self.advance(); // consume ']'
            TokenKind::QuotedId(value, false)
        } else {
            self.pos = self.src.len();
            TokenKind::Error(format!("unterminated bracket identifier at byte {start}"))
        }
    }

    /// Lexes a blob literal `X'...'` with an even number of hex digits.
    fn lex_blob(&mut self) -> TokenKind {
        let start = self.pos;
        self.advance(); // consume 'X' or 'x'
        self.advance(); // consume the opening quote
        let hex_start = self.pos;
        let remaining = &self.src[self.pos..];
        if let Some(offset) = memchr(b'\'', remaining) {
            let hex_bytes = &self.src[hex_start..hex_start + offset];
            self.advance_by(offset);
            self.advance(); // consume the closing quote

            if hex_bytes.len() % 2 != 0 {
                return TokenKind::Error(format!(
                    "blob literal has odd number of hex digits at byte {start}"
                ));
            }

            let mut bytes = Vec::with_capacity(hex_bytes.len() / 2);
            for pair in hex_bytes.chunks_exact(2) {
                let hi = hex_digit(pair[0]);
                let lo = hex_digit(pair[1]);
                match (hi, lo) {
                    (Some(h), Some(l)) => bytes.push((h << 4) | l),
                    _ => {
                        return TokenKind::Error(format!(
                            "invalid hex in blob literal at byte {start}"
                        ));
                    }
                }
            }
            TokenKind::Blob(bytes)
        } else {
            self.pos = self.src.len();
            TokenKind::Error(format!("unterminated blob literal at byte {start}"))
        }
    }

    fn lex_number(&mut self) -> TokenKind {
        let start = self.pos;

        // Hex literal: 0x/0X followed by hex digits.
        if self.src[self.pos] == b'0' && self.peek_at(1).is_some_and(|c| c == b'x' || c == b'X') {
            self.advance(); // consume '0'
            self.advance(); // consume 'x' or 'X'
            let hex_start = self.pos;
            while self.pos < self.src.len() && self.src[self.pos].is_ascii_hexdigit() {
                self.advance();
            }
            if self.pos == hex_start {
                return TokenKind::Error("empty hex literal".to_owned());
            }
            let hex_str = String::from_utf8_lossy(&self.src[hex_start..self.pos]);
            // At most 16 significant hex digits fit in 64 bits; leading
            // zeros do not count against the limit.
            let significant = hex_str.trim_start_matches('0');
            if significant.len() > 16 {
                return TokenKind::Error(format!("hex literal out of range at byte {start}"));
            }
            let parse_str = if significant.is_empty() {
                "0"
            } else {
                significant
            };
            // Parse as u64 and wrap into i64 (two's complement), so
            // 0xFFFFFFFFFFFFFFFF lexes as -1.
            return match u64::from_str_radix(parse_str, 16) {
                Ok(v) => {
                    #[allow(clippy::cast_possible_wrap)]
                    let i = v as i64;
                    TokenKind::Integer(i)
                }
                Err(_) => TokenKind::Error(format!("hex literal out of range at byte {start}")),
            };
        }

        let mut is_float = false;

        // Integer part.
        while self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
            self.advance();
        }

        // True if an exponent (e/E, optional sign, at least one digit)
        // starts at `offset`.
        let is_valid_exponent = |lexer: &Self, mut offset: usize| -> bool {
            if let Some(c) = lexer.peek_at(offset) {
                if c == b'e' || c == b'E' {
                    offset += 1;
                    if let Some(s) = lexer.peek_at(offset) {
                        if s == b'+' || s == b'-' {
                            offset += 1;
                        }
                    }
                    if let Some(d) = lexer.peek_at(offset) {
                        return d.is_ascii_digit();
                    }
                }
            }
            false
        };

        // Fractional part: '.' followed by digits or a valid exponent.
        if self.pos < self.src.len()
            && self.src[self.pos] == b'.'
            && (self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) || is_valid_exponent(self, 1))
        {
            is_float = true;
            self.advance(); // consume '.'
            while self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
                self.advance();
            }
        } else if self.pos < self.src.len()
            && self.src[self.pos] == b'.'
            && start < self.pos
            && !self.peek_at(1).is_some_and(|c| c.is_ascii_alphanumeric() || c == b'_')
        {
            // A trailing '.' after digits (e.g. "1.") still makes a float,
            // as long as no identifier-like suffix follows.
            is_float = true;
            self.advance(); // consume '.'
        }

        // A literal that starts with '.' is always a float.
        if self.src[start] == b'.' {
            is_float = true;
        }

        // Exponent part.
        if is_valid_exponent(self, 0) {
            is_float = true;
            self.advance(); // consume 'e' or 'E'
            if self.pos < self.src.len()
                && (self.src[self.pos] == b'+' || self.src[self.pos] == b'-')
            {
                self.advance();
            }
            while self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
                self.advance();
            }
        }

        // A trailing identifier character turns the whole run into an error
        // token, e.g. "123abc".
        if let Some(c) = self.peek() {
            if c.is_ascii_alphabetic()
                || c == b'_'
                || (c == b'.'
                    && self
                        .peek_at(1)
                        .is_some_and(|n| n.is_ascii_alphabetic() || n == b'_'))
            {
                let err_start = start;
                while self.pos < self.src.len() {
                    let ch = self.src[self.pos];
                    if ch.is_ascii_alphanumeric() || ch == b'_' || ch == b'.' {
                        self.advance();
                    } else {
                        break;
                    }
                }
                let err_text = String::from_utf8_lossy(&self.src[err_start..self.pos]);
                return TokenKind::Error(format!("unrecognized token: \"{err_text}\""));
            }
        }

        let text = String::from_utf8_lossy(&self.src[start..self.pos]);
        if is_float {
            match text.parse::<f64>() {
                Ok(v) => TokenKind::Float(v),
                Err(_) => {
                    // Retry with a leading zero for forms like ".e5" that
                    // f64::parse rejects.
                    let mut text_fixed = text.clone().into_owned();
                    if text_fixed.starts_with(".e") || text_fixed.starts_with(".E") {
                        text_fixed.insert(0, '0');
                    }
                    match text_fixed.parse::<f64>() {
                        Ok(v) => TokenKind::Float(v),
                        Err(_) => TokenKind::Error(format!("invalid float: {text}")),
                    }
                }
            }
        } else {
            match text.parse::<i64>() {
                Ok(v) => TokenKind::Integer(v),
                Err(_) => {
                    // Too large for i64; keep the digits so later stages can
                    // decide how to handle the literal.
                    TokenKind::OversizedInt(text.into_owned())
                }
            }
        }
    }

    fn lex_identifier(&mut self) -> TokenKind {
        let start = self.pos;
        self.advance(); // the first byte was validated by the dispatcher
        while self.pos < self.src.len() {
            let ch = self.src[self.pos];
            if ch.is_ascii_alphanumeric() || ch == b'_' || ch >= 0x80 {
                self.advance();
            } else {
                break;
            }
        }

        let text = String::from_utf8_lossy(&self.src[start..self.pos]).into_owned();

        // Keywords are matched case-insensitively; anything else is a plain
        // identifier.
        if let Some(kw) = TokenKind::lookup_keyword(&text) {
            kw
        } else {
            TokenKind::Id(text)
        }
    }

    fn lex_question(&mut self) -> TokenKind {
        self.advance(); // consume '?'
        if self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
            let num_start = self.pos;
            while self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
                self.advance();
            }
            let text = String::from_utf8_lossy(&self.src[num_start..self.pos]);
            match text.parse::<u32>() {
                Ok(n) if (1..=MAX_VARIABLE_NUMBER).contains(&n) => TokenKind::QuestionNum(n),
                Ok(n) => TokenKind::Error(format!(
                    "variable number must be between ?1 and ?{MAX_VARIABLE_NUMBER}, got ?{n}"
                )),
                Err(_) => TokenKind::Error("invalid parameter number".to_owned()),
            }
        } else {
            TokenKind::Question
        }
    }

    /// Lexes a named parameter after `:`/`@`/`$`, including Tcl-style
    /// `::name` and `name(...)` suffixes.
    fn lex_alpha_param(&mut self, prefix: char, constructor: fn(String) -> TokenKind) -> TokenKind {
        self.advance(); // consume the prefix character
        let name_start = self.pos;
        while self.pos < self.src.len() {
            let ch = self.src[self.pos];
            if ch.is_ascii_alphanumeric() || ch == b'_' || ch >= 0x80 {
                self.advance();
            } else if ch == b':' && self.peek_at(1) == Some(b':') {
                self.advance();
                self.advance();
            } else if ch == b'(' {
                self.advance();
                while self.pos < self.src.len() && self.src[self.pos] != b')' {
                    self.advance();
                }
                if self.pos >= self.src.len() || self.src[self.pos] != b')' {
                    let name = String::from_utf8_lossy(&self.src[name_start..self.pos]);
                    return TokenKind::Error(format!("unrecognized token: \"{prefix}{name}\""));
                }
                self.advance();
                break; // a '(...)' suffix ends the parameter name
            } else {
                break;
            }
        }
        if self.pos == name_start {
            return TokenKind::Error(format!("empty parameter name after '{prefix}'"));
        }
        let name = String::from_utf8_lossy(&self.src[name_start..self.pos]).into_owned();
        constructor(name)
    }

    fn lex_colon_param(&mut self) -> TokenKind {
        self.lex_alpha_param(':', TokenKind::ColonParam)
    }

    fn lex_at_param(&mut self) -> TokenKind {
        self.lex_alpha_param('@', TokenKind::AtParam)
    }

    fn lex_dollar_param(&mut self) -> TokenKind {
        self.lex_alpha_param('$', TokenKind::DollarParam)
    }

    /// Distinguishes `-`, `->` (arrow), and `->>` (double arrow).
    fn lex_minus_or_arrow(&mut self) -> TokenKind {
        self.advance(); // consume '-'
        if self.peek() == Some(b'>') {
            self.advance(); // consume '>'
            if self.peek() == Some(b'>') {
                self.advance(); // consume the second '>'
                TokenKind::DoubleArrow
            } else {
                TokenKind::Arrow
            }
        } else {
            TokenKind::Minus
        }
    }

    fn lex_lt(&mut self) -> TokenKind {
        self.advance(); // consume '<'
        match self.peek() {
            Some(b'=') => {
                self.advance();
                TokenKind::Le
            }
            Some(b'>') => {
                self.advance();
                TokenKind::LtGt
            }
            Some(b'<') => {
                self.advance();
                TokenKind::ShiftLeft
            }
            _ => TokenKind::Lt,
        }
    }

    fn lex_gt(&mut self) -> TokenKind {
        self.advance(); // consume '>'
        match self.peek() {
            Some(b'=') => {
                self.advance();
                TokenKind::Ge
            }
            Some(b'>') => {
                self.advance();
                TokenKind::ShiftRight
            }
            _ => TokenKind::Gt,
        }
    }

    fn lex_eq(&mut self) -> TokenKind {
        self.advance(); // consume '='
        if self.peek() == Some(b'=') {
            self.advance();
            TokenKind::EqEq
        } else {
            TokenKind::Eq
        }
    }

    fn lex_bang(&mut self) -> TokenKind {
        self.advance(); // consume '!'
        if self.peek() == Some(b'=') {
            self.advance();
            TokenKind::Ne
        } else {
            TokenKind::Error("unexpected '!', did you mean '!='?".to_owned())
        }
    }

    fn lex_pipe(&mut self) -> TokenKind {
        self.advance(); // consume '|'
        if self.peek() == Some(b'|') {
            self.advance();
            TokenKind::Concat
        } else {
            TokenKind::Pipe
        }
    }
}
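
/// Decodes one ASCII hex digit to its numeric value, or `None` for any
/// other byte.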
const fn hex_digit(b: u8) -> Option<u8> {
    match b {
        b'0'..=b'9' => Some(b - b'0'),
        b'a'..=b'f' => Some(b - b'a' + 10),
        b'A'..=b'F' => Some(b - b'A' + 10),
        _ => None,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn lex(src: &str) -> Vec<Token> {
        Lexer::tokenize(src)
    }

    fn kinds(src: &str) -> Vec<TokenKind> {
        lex(src).into_iter().map(|t| t.kind).collect()
    }

    #[test]
    fn test_lex_integer_literals() {
        let tokens = kinds("42 0 0xFF");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Integer(42),
                TokenKind::Integer(0),
                TokenKind::Integer(255),
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_lex_float_literals() {
        let tokens = kinds("3.14 1e10 .5 1.0e-3 0.0");
        // 3.14 spelled as a sum to sidestep clippy's approximate-constant lint.
        let expected = 3.0 + 0.14;
        assert!(matches!(
            tokens[0],
            TokenKind::Float(v) if (v - expected).abs() < 1e-10
        ));
        assert!(matches!(tokens[1], TokenKind::Float(v) if (v - 1e10).abs() < 1.0));
        assert!(matches!(tokens[2], TokenKind::Float(v) if (v - 0.5).abs() < 1e-10));
        assert!(matches!(tokens[3], TokenKind::Float(v) if (v - 0.001).abs() < 1e-10));
        assert!(matches!(tokens[4], TokenKind::Float(v) if v.abs() < 1e-10));
        assert_eq!(tokens[5], TokenKind::Eof);
    }

    #[test]
    fn test_lex_string_literals() {
        let tokens = kinds("'hello' 'it''s' ''");
        assert_eq!(tokens[0], TokenKind::String("hello".to_owned()));
        assert_eq!(tokens[1], TokenKind::String("it's".to_owned()));
        assert_eq!(tokens[2], TokenKind::String(String::new()));
        assert_eq!(tokens[3], TokenKind::Eof);
    }

    #[test]
    fn test_lex_blob_literals() {
        let tokens = kinds("X'CAFE' x'00ff' X''");
        assert_eq!(tokens[0], TokenKind::Blob(vec![0xCA, 0xFE]));
        assert_eq!(tokens[1], TokenKind::Blob(vec![0x00, 0xFF]));
        assert_eq!(tokens[2], TokenKind::Blob(vec![]));
        assert_eq!(tokens[3], TokenKind::Eof);
    }

    #[test]
    fn test_lex_blob_odd_hex_error() {
        let tokens = kinds("X'CAF'");
        assert!(matches!(tokens[0], TokenKind::Error(_)));
    }

    #[test]
    fn test_lex_blob_non_ascii_no_panic() {
        // Non-ASCII bytes inside a blob literal must error, not panic.
        let tokens = kinds("X'U\u{05fc} '");
        assert!(matches!(tokens[0], TokenKind::Error(_)));

        // Invalid hex digits are also rejected.
        let tokens2 = kinds("X'GG'");
        assert!(matches!(tokens2[0], TokenKind::Error(_)));
    }

    #[test]
    fn test_lex_variables() {
        let tokens = kinds("?1 :name @param $var ?");
        assert_eq!(tokens[0], TokenKind::QuestionNum(1));
        assert_eq!(tokens[1], TokenKind::ColonParam("name".to_owned()));
        assert_eq!(tokens[2], TokenKind::AtParam("param".to_owned()));
        assert_eq!(tokens[3], TokenKind::DollarParam("var".to_owned()));
        assert_eq!(tokens[4], TokenKind::Question);
        assert_eq!(tokens[5], TokenKind::Eof);
    }

    #[test]
    fn test_lex_quoted_identifiers() {
        let tokens = kinds("\"table_name\" [column] `backtick`");
        assert_eq!(
            tokens[0],
            TokenKind::QuotedId("table_name".to_owned(), true)
        );
        assert_eq!(tokens[1], TokenKind::QuotedId("column".to_owned(), false));
        assert_eq!(tokens[2], TokenKind::QuotedId("backtick".to_owned(), false));
    }

    #[test]
    fn test_lex_dqs_flag() {
        let tokens = kinds("\"hello\"");
        // The `true` flag marks a double-quoted identifier.
        assert_eq!(tokens[0], TokenKind::QuotedId("hello".to_owned(), true));
    }

    #[test]
    fn test_lex_keywords() {
        let tokens = kinds("SELECT FROM WHERE INSERT CREATE TABLE CONCURRENT");
        assert_eq!(tokens[0], TokenKind::KwSelect);
        assert_eq!(tokens[1], TokenKind::KwFrom);
        assert_eq!(tokens[2], TokenKind::KwWhere);
        assert_eq!(tokens[3], TokenKind::KwInsert);
        assert_eq!(tokens[4], TokenKind::KwCreate);
        assert_eq!(tokens[5], TokenKind::KwTable);
        assert_eq!(tokens[6], TokenKind::KwConcurrent);

        // Keyword matching is case-insensitive.
        let tokens2 = kinds("select from where");
        assert_eq!(tokens2[0], TokenKind::KwSelect);
        assert_eq!(tokens2[1], TokenKind::KwFrom);
        assert_eq!(tokens2[2], TokenKind::KwWhere);
    }

    #[test]
    fn test_lex_operators() {
        let tokens = kinds("+ - * / % & | ~ << >> = < <= > >= == != <> || -> ->>");
        let expected = vec![
            TokenKind::Plus,
            TokenKind::Minus,
            TokenKind::Star,
            TokenKind::Slash,
            TokenKind::Percent,
            TokenKind::Ampersand,
            TokenKind::Pipe,
            TokenKind::Tilde,
            TokenKind::ShiftLeft,
            TokenKind::ShiftRight,
            TokenKind::Eq,
            TokenKind::Lt,
            TokenKind::Le,
            TokenKind::Gt,
            TokenKind::Ge,
            TokenKind::EqEq,
            TokenKind::Ne,
            TokenKind::LtGt,
            TokenKind::Concat,
            TokenKind::Arrow,
            TokenKind::DoubleArrow,
            TokenKind::Eof,
        ];
        assert_eq!(tokens, expected);
    }

    #[test]
    fn test_lex_eq_vs_eqeq() {
        let tokens = kinds("= ==");
        assert_eq!(tokens[0], TokenKind::Eq);
        assert_eq!(tokens[1], TokenKind::EqEq);
    }

    #[test]
    fn test_lex_ne_vs_ltgt() {
        let tokens = kinds("!= <>");
        assert_eq!(tokens[0], TokenKind::Ne);
        assert_eq!(tokens[1], TokenKind::LtGt);
    }

    #[test]
    fn test_lex_error_unterminated_string() {
        let tokens = kinds("'hello");
        assert!(matches!(tokens[0], TokenKind::Error(_)));
    }

    #[test]
    fn test_lex_line_column_tracking() {
        let tokens = lex("SELECT\n  a,\n  b");
        assert_eq!(tokens[0].line, 1);
        assert_eq!(tokens[0].col, 1);
        // `a` sits on line 2 after two spaces of indentation.
        assert_eq!(tokens[1].line, 2);
        assert_eq!(tokens[1].col, 3);
        // The comma immediately follows `a`.
        assert_eq!(tokens[2].line, 2);
        assert_eq!(tokens[2].col, 4);
        // `b` sits on line 3.
        assert_eq!(tokens[3].line, 3);
        assert_eq!(tokens[3].col, 3);
    }

    #[test]
    fn test_lex_whitespace_and_comments_skipped() {
        let tokens = kinds("SELECT -- this is a comment\n a /* block */ FROM b");
        assert_eq!(tokens[0], TokenKind::KwSelect);
        assert_eq!(tokens[1], TokenKind::Id("a".to_owned()));
        assert_eq!(tokens[2], TokenKind::KwFrom);
        assert_eq!(tokens[3], TokenKind::Id("b".to_owned()));
        assert_eq!(tokens[4], TokenKind::Eof);
    }

    #[test]
    fn test_lex_hex_large_values() {
        // 0xFFFFFFFFFFFFFFFF wraps to -1 (two's complement).
        let tokens = kinds("0xFFFFFFFFFFFFFFFF");
        assert_eq!(tokens[0], TokenKind::Integer(-1));

        // 0x8000000000000000 wraps to i64::MIN.
        let tokens = kinds("0x8000000000000000");
        assert_eq!(tokens[0], TokenKind::Integer(i64::MIN));

        let tokens = kinds("0x7FFFFFFFFFFFFFFF");
        assert_eq!(tokens[0], TokenKind::Integer(i64::MAX));
    }

    #[test]
    fn test_lex_hex_overflow_17_digits_rejects() {
        // Seventeen significant hex digits cannot fit in 64 bits.
        let tokens = kinds("0x10000000000000000");
        assert!(
            matches!(&tokens[0], TokenKind::Error(msg) if msg.contains("out of range")),
            "expected error for 17-digit hex, got {:?}",
            tokens[0]
        );
    }

    #[test]
    fn test_lex_hex_leading_zeros_accepted() {
        // Leading zeros do not count toward the 16-digit limit.
        let tokens = kinds("0x00000000000000001");
        assert_eq!(tokens[0], TokenKind::Integer(1));
    }

    #[test]
    fn test_lex_number_hex() {
        let tokens = kinds("0x1A 0Xff 0x0");
        assert_eq!(tokens[0], TokenKind::Integer(26));
        assert_eq!(tokens[1], TokenKind::Integer(255));
        assert_eq!(tokens[2], TokenKind::Integer(0));
        assert_eq!(tokens[3], TokenKind::Eof);
    }

    #[test]
    fn test_lex_number_unrecognized() {
        let tokens = kinds("123a 123.a");
        assert!(
            matches!(tokens[0], TokenKind::Error(ref e) if e.contains("unrecognized token: \"123a\""))
        );
        assert!(
            matches!(tokens[1], TokenKind::Error(ref e) if e.contains("unrecognized token: \"123.a\""))
        );
    }

    #[test]
    fn test_lex_number_hex_invalid() {
        let tokens = kinds("0x");
        assert!(matches!(tokens[0], TokenKind::Error(_)));
    }

    #[test]
    fn test_lex_positional_params() {
        let tokens = kinds("? ?123");
        assert_eq!(tokens[0], TokenKind::Question);
        assert_eq!(tokens[1], TokenKind::QuestionNum(123));
        assert_eq!(tokens[2], TokenKind::Eof);
    }

    #[test]
    fn test_lex_positional_params_reject_zero_and_out_of_range() {
        let tokens = kinds("?0 ?32767");
        assert!(
            matches!(tokens[0], TokenKind::Error(ref e) if e.contains("between ?1 and ?32766")),
            "expected ?0 to be rejected, got {:?}",
            tokens[0]
        );
        assert!(
            matches!(tokens[1], TokenKind::Error(ref e) if e.contains("between ?1 and ?32766")),
            "expected ?32767 to be rejected, got {:?}",
            tokens[1]
        );
        assert_eq!(tokens[2], TokenKind::Eof);
    }

    #[test]
    fn test_lex_named_params() {
        let tokens = kinds(":foo @bar $baz_123");
        assert_eq!(tokens[0], TokenKind::ColonParam("foo".to_owned()));
        assert_eq!(tokens[1], TokenKind::AtParam("bar".to_owned()));
        assert_eq!(tokens[2], TokenKind::DollarParam("baz_123".to_owned()));
        assert_eq!(tokens[3], TokenKind::Eof);
    }

    #[test]
    fn test_lex_named_params_with_tcl_syntax() {
        let tokens = kinds("$::foo(bar) :a::b");
        assert_eq!(tokens[0], TokenKind::DollarParam("::foo(bar)".to_owned()));
        assert_eq!(tokens[1], TokenKind::ColonParam("a::b".to_owned()));
        assert_eq!(tokens[2], TokenKind::Eof);
    }

    #[test]
    fn test_lex_named_params_with_unclosed_tcl_array_syntax() {
        let tokens = kinds("$::foo(bar");
        assert!(
            matches!(tokens[0], TokenKind::Error(ref e) if e.contains("unrecognized token")),
            "expected unterminated Tcl-style parameter to be rejected, got {:?}",
            tokens[0]
        );
        assert_eq!(tokens[1], TokenKind::Eof);
    }

    fn histogram_total(hist: &TokenizeDurationSecondsHistogram) -> u64 {
        hist.le_100us + hist.le_250us + hist.le_500us + hist.le_1ms + hist.le_5ms + hist.gt_5ms
    }

    #[test]
    fn test_tokenize_metrics_accumulate_tokens_and_histogram_samples() {
        reset_tokenize_metrics();

        let first = lex("SELECT 1;");
        let second = lex("SELECT 2;");

        let expected_total_tokens =
            u64::try_from(first.len() + second.len()).expect("small token vectors should fit");
        let snap = tokenize_metrics_snapshot();
        assert_eq!(snap.fsqlite_tokenize_tokens_total, expected_total_tokens);
        assert_eq!(snap.fsqlite_tokenize_duration_seconds_count, 2);
        assert_eq!(
            histogram_total(&snap.fsqlite_tokenize_duration_seconds),
            snap.fsqlite_tokenize_duration_seconds_count
        );
    }

    #[test]
    fn test_tokenize_metrics_reset_clears_all_fields() {
        reset_tokenize_metrics();
        let _ = lex("SELECT 42;");

        let before = tokenize_metrics_snapshot();
        assert!(before.fsqlite_tokenize_tokens_total > 0);
        assert!(before.fsqlite_tokenize_duration_seconds_count > 0);

        reset_tokenize_metrics();
        let after = tokenize_metrics_snapshot();
        assert_eq!(after.fsqlite_tokenize_tokens_total, 0);
        assert_eq!(after.fsqlite_tokenize_duration_seconds_count, 0);
        assert_eq!(after.fsqlite_tokenize_duration_seconds_sum_micros, 0);
        assert_eq!(histogram_total(&after.fsqlite_tokenize_duration_seconds), 0);
    }
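
    // Additional edge-case checks sketched from the lexer's control flow
    // above; they assert only coarse token shapes to stay robust.

    #[test]
    fn test_lex_bang_alone_is_error() {
        let tokens = kinds("!");
        assert!(matches!(tokens[0], TokenKind::Error(_)));
        assert_eq!(tokens[1], TokenKind::Eof);
    }

    #[test]
    fn test_lex_unterminated_block_comment_skips_to_eof() {
        let tokens = kinds("SELECT /* unterminated");
        assert_eq!(tokens[0], TokenKind::KwSelect);
        assert_eq!(tokens[1], TokenKind::Eof);
    }

    #[test]
    fn test_lex_empty_param_names_are_errors() {
        let tokens = kinds(": @ $");
        assert!(matches!(tokens[0], TokenKind::Error(_)));
        assert!(matches!(tokens[1], TokenKind::Error(_)));
        assert!(matches!(tokens[2], TokenKind::Error(_)));
        assert_eq!(tokens[3], TokenKind::Eof);
    }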
}