use fsqlite_ast::Span;
use fsqlite_types::limits::MAX_VARIABLE_NUMBER;
use hashbrown::HashSet;
use memchr::memchr;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::time::Instant;
use tracing::Level;
use crate::token::{Token, TokenKind};
/// Histogram of per-call tokenize durations, bucketed by elapsed microseconds.
///
/// The buckets are disjoint rather than cumulative: every recorded tokenize
/// call increments exactly one field, so the fields sum to the number of
/// recorded calls.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct TokenizeDurationSecondsHistogram {
/// Calls that took 0..=100 microseconds.
pub le_100us: u64,
/// Calls that took 101..=250 microseconds.
pub le_250us: u64,
/// Calls that took 251..=500 microseconds.
pub le_500us: u64,
/// Calls that took 501..=1_000 microseconds.
pub le_1ms: u64,
/// Calls that took 1_001..=5_000 microseconds.
pub le_5ms: u64,
/// Calls that took more than 5_000 microseconds.
pub gt_5ms: u64,
}
/// Point-in-time copy of the process-wide tokenize counters.
///
/// Each field is read with a separate relaxed atomic load, so the snapshot is
/// not guaranteed to be mutually consistent while tokenization is running
/// concurrently on other threads.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct TokenizeMetricsSnapshot {
/// Total number of tokens produced by recorded tokenize calls (Eof included).
pub fsqlite_tokenize_tokens_total: u64,
/// Duration histogram of recorded tokenize calls.
pub fsqlite_tokenize_duration_seconds: TokenizeDurationSecondsHistogram,
/// Number of recorded tokenize calls.
pub fsqlite_tokenize_duration_seconds_count: u64,
/// Sum of the elapsed time of recorded tokenize calls, in microseconds.
pub fsqlite_tokenize_duration_seconds_sum_micros: u64,
}
// Process-wide tokenize counters. All of them start at zero and are only
// touched with `Ordering::Relaxed` operations.

// Running total of tokens produced by recorded tokenize calls.
static FSQLITE_TOKENIZE_TOKENS_TOTAL: AtomicU64 = AtomicU64::new(0);
// Disjoint duration buckets; exactly one is bumped per recorded call.
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_100US: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_250US: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_500US: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_1MS: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_LE_5MS: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_GT_5MS: AtomicU64 = AtomicU64::new(0);
// Number of recorded calls and the sum of their durations in microseconds.
static FSQLITE_TOKENIZE_DURATION_SECONDS_COUNT: AtomicU64 = AtomicU64::new(0);
static FSQLITE_TOKENIZE_DURATION_SECONDS_SUM_MICROS: AtomicU64 = AtomicU64::new(0);
// Gate for metric collection; disabled by default.
static FSQLITE_TOKENIZE_METRICS_ENABLED: AtomicBool = AtomicBool::new(false);
/// Converts a `usize` into a `u64`, clamping to `u64::MAX` when the value
/// does not fit.
fn saturating_u64_from_usize(value: usize) -> u64 {
    match u64::try_from(value) {
        Ok(converted) => converted,
        Err(_) => u64::MAX,
    }
}
/// Converts a `u128` into a `u64`, clamping to `u64::MAX` when the value
/// does not fit.
fn saturating_u64_from_u128(value: u128) -> u64 {
    match u64::try_from(value) {
        Ok(converted) => converted,
        Err(_) => u64::MAX,
    }
}
/// Folds one finished tokenize call into the global counters.
///
/// Adds `token_count` to the token total, bumps the call count, adds
/// `elapsed_micros` to the duration sum, and increments the single histogram
/// bucket that `elapsed_micros` falls into.
fn record_tokenize_metrics(token_count: usize, elapsed_micros: u64) {
    let produced = saturating_u64_from_usize(token_count);
    FSQLITE_TOKENIZE_TOKENS_TOTAL.fetch_add(produced, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_COUNT.fetch_add(1, Ordering::Relaxed);
    FSQLITE_TOKENIZE_DURATION_SECONDS_SUM_MICROS.fetch_add(elapsed_micros, Ordering::Relaxed);

    // Upper bounds are inclusive; anything above 5 ms lands in the overflow bucket.
    let histogram_slot = if elapsed_micros <= 100 {
        &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_100US
    } else if elapsed_micros <= 250 {
        &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_250US
    } else if elapsed_micros <= 500 {
        &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_500US
    } else if elapsed_micros <= 1_000 {
        &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_1MS
    } else if elapsed_micros <= 5_000 {
        &FSQLITE_TOKENIZE_DURATION_SECONDS_LE_5MS
    } else {
        &FSQLITE_TOKENIZE_DURATION_SECONDS_GT_5MS
    };
    histogram_slot.fetch_add(1, Ordering::Relaxed);
}
#[must_use]
pub fn tokenize_metrics_snapshot() -> TokenizeMetricsSnapshot {
TokenizeMetricsSnapshot {
fsqlite_tokenize_tokens_total: FSQLITE_TOKENIZE_TOKENS_TOTAL.load(Ordering::Relaxed),
fsqlite_tokenize_duration_seconds: TokenizeDurationSecondsHistogram {
le_100us: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_100US.load(Ordering::Relaxed),
le_250us: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_250US.load(Ordering::Relaxed),
le_500us: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_500US.load(Ordering::Relaxed),
le_1ms: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_1MS.load(Ordering::Relaxed),
le_5ms: FSQLITE_TOKENIZE_DURATION_SECONDS_LE_5MS.load(Ordering::Relaxed),
gt_5ms: FSQLITE_TOKENIZE_DURATION_SECONDS_GT_5MS.load(Ordering::Relaxed),
},
fsqlite_tokenize_duration_seconds_count: FSQLITE_TOKENIZE_DURATION_SECONDS_COUNT
.load(Ordering::Relaxed),
fsqlite_tokenize_duration_seconds_sum_micros: FSQLITE_TOKENIZE_DURATION_SECONDS_SUM_MICROS
.load(Ordering::Relaxed),
}
}
pub fn set_tokenize_metrics_enabled(enabled: bool) {
FSQLITE_TOKENIZE_METRICS_ENABLED.store(enabled, Ordering::Relaxed);
}
#[must_use]
pub fn tokenize_metrics_enabled() -> bool {
FSQLITE_TOKENIZE_METRICS_ENABLED.load(Ordering::Relaxed)
}
pub fn reset_tokenize_metrics() {
FSQLITE_TOKENIZE_TOKENS_TOTAL.store(0, Ordering::Relaxed);
FSQLITE_TOKENIZE_DURATION_SECONDS_LE_100US.store(0, Ordering::Relaxed);
FSQLITE_TOKENIZE_DURATION_SECONDS_LE_250US.store(0, Ordering::Relaxed);
FSQLITE_TOKENIZE_DURATION_SECONDS_LE_500US.store(0, Ordering::Relaxed);
FSQLITE_TOKENIZE_DURATION_SECONDS_LE_1MS.store(0, Ordering::Relaxed);
FSQLITE_TOKENIZE_DURATION_SECONDS_LE_5MS.store(0, Ordering::Relaxed);
FSQLITE_TOKENIZE_DURATION_SECONDS_GT_5MS.store(0, Ordering::Relaxed);
FSQLITE_TOKENIZE_DURATION_SECONDS_COUNT.store(0, Ordering::Relaxed);
FSQLITE_TOKENIZE_DURATION_SECONDS_SUM_MICROS.store(0, Ordering::Relaxed);
}
// Retention limits for a reused `IdentifierInterner`: when either limit is
// exceeded, `prepare_for_next_parse` discards the interner's contents.

// Maximum number of interned identifiers kept between parses.
const MAX_RETAINED_IDENTIFIER_INTERNER_ENTRIES: usize = 256;
// Maximum estimated bytes (see `retained_bytes`) kept between parses: 16 KiB.
const MAX_RETAINED_IDENTIFIER_INTERNER_BYTES: usize = 16 * 1024;
/// Deduplicating store for identifier text.
///
/// Identifiers with equal text share one `Arc<str>` allocation.
#[derive(Debug, Default)]
pub(crate) struct IdentifierInterner {
// Set of every identifier string interned so far.
values: HashSet<Arc<str>>,
}
impl IdentifierInterner {
    /// Returns the shared `Arc<str>` for `value`, allocating and remembering a
    /// new one the first time a given text is seen.
    fn intern(&mut self, value: &str) -> Arc<str> {
        match self.values.get(value) {
            Some(known) => Arc::clone(known),
            None => {
                let fresh: Arc<str> = Arc::from(value);
                self.values.insert(Arc::clone(&fresh));
                fresh
            }
        }
    }

    /// Drops every interned string together with the set's backing storage.
    pub(crate) fn reset(&mut self) {
        self.values = HashSet::new();
    }

    /// Estimates the memory held by the interner: one `Arc<str>` handle per
    /// slot of set capacity plus the text length of every interned string.
    /// All arithmetic saturates.
    pub(crate) fn retained_bytes(&self) -> usize {
        let mut text_bytes = 0usize;
        for entry in self.values.iter() {
            text_bytes = text_bytes.saturating_add(entry.len());
        }
        let handle_size = std::mem::size_of::<Arc<str>>();
        let slot_bytes = self.values.capacity().saturating_mul(handle_size);
        slot_bytes.saturating_add(text_bytes)
    }

    /// Resets the interner when it has grown past either retention limit, so
    /// that a reused interner cannot grow without bound across parses.
    pub(crate) fn prepare_for_next_parse(&mut self) {
        let too_many_entries = self.values.len() > MAX_RETAINED_IDENTIFIER_INTERNER_ENTRIES;
        if too_many_entries || self.retained_bytes() > MAX_RETAINED_IDENTIFIER_INTERNER_BYTES {
            self.reset();
        }
    }

    /// Test helper: true when nothing is interned.
    #[cfg(test)]
    pub(crate) fn is_empty(&self) -> bool {
        self.values.is_empty()
    }

    /// Test helper: number of distinct interned strings.
    #[cfg(test)]
    pub(crate) fn len(&self) -> usize {
        self.values.len()
    }
}
/// Byte-oriented SQL lexer over a borrowed source string.
pub struct Lexer<'a> {
// Raw bytes of the source text being tokenized.
src: &'a [u8],
// Byte offset of the next unread byte in `src`.
pos: usize,
// Line of the next unread byte; starts at 1 and grows on each `\n`.
line: u32,
// Column of the next unread byte; starts at 1 and resets to 1 after `\n`.
// Counted in bytes, not characters.
col: u32,
// Whether `advance` emits a TRACE event per consumed byte; sampled once at
// construction time.
trace_chars: bool,
// Deduplicates identifier text so equal identifiers share one allocation.
interner: IdentifierInterner,
}
impl<'a> Lexer<'a> {
/// Emits a DEBUG event on the `fsqlite.parse` target describing `token`
/// (kind, byte span, and starting line/column).
fn log_token(token: &Token) {
tracing::debug!(
target: "fsqlite.parse",
token = ?token.kind,
start = token.span.start,
end = token.span.end,
line = token.line,
col = token.col,
"tokenized token"
);
}
#[must_use]
pub fn new(source: &'a str) -> Self {
Self {
src: source.as_bytes(),
pos: 0,
line: 1,
col: 1,
trace_chars: tracing::enabled!(target: "fsqlite.parse", Level::TRACE),
interner: IdentifierInterner::default(),
}
}
/// Tokenizes all of `source` and returns the tokens in a newly allocated
/// vector. The final element is always the `Eof` token.
#[must_use]
pub fn tokenize(source: &'a str) -> Vec<Token> {
    let mut collected: Vec<Token> = Vec::new();
    Self::tokenize_into(source, &mut collected);
    collected
}
/// Creates a lexer positioned at the start of `source` (line 1, column 1)
/// that interns identifiers into the supplied `interner`.
fn new_with_interner(source: &'a str, interner: IdentifierInterner) -> Self {
    // Sampled once so the per-byte hot path only checks a bool.
    let trace_chars = tracing::enabled!(target: "fsqlite.parse", Level::TRACE);
    let src = source.as_bytes();
    Self {
        src,
        pos: 0,
        line: 1,
        col: 1,
        trace_chars,
        interner,
    }
}
/// Tokenizes all of `source` into the caller-owned `tokens` buffer, replacing
/// its previous contents. A throwaway interner is used for this call only.
pub fn tokenize_into(source: &'a str, tokens: &mut Vec<Token>) {
    let mut scratch_interner = IdentifierInterner::default();
    let interner = &mut scratch_interner;
    Self::tokenize_into_with_interner(source, tokens, interner);
}
/// Tokenizes all of `source` into `tokens`, reusing the caller's token buffer
/// and identifier interner.
///
/// `tokens` is cleared first; on return it holds every token of `source`,
/// ending with the `Eof` token. The interner is moved into the lexer for the
/// duration of the call and handed back afterwards, so identifiers interned
/// here remain available to the caller.
///
/// When tokenize metrics are enabled, or TRACE is enabled for the
/// `fsqlite.parse` target, the call is timed; the result is recorded on the
/// `tokenize` span and/or in the global metrics.
pub(crate) fn tokenize_into_with_interner(
    source: &'a str,
    tokens: &mut Vec<Token>,
    interner: &mut IdentifierInterner,
) {
    let input_bytes = source.len();
    let collect_tokenize_metrics = tokenize_metrics_enabled();
    let trace_tokenize = tracing::enabled!(target: "fsqlite.parse", Level::TRACE);
    let span = trace_tokenize.then(|| {
        tracing::span!(
            target: "fsqlite.parse",
            Level::TRACE,
            "tokenize",
            token_count = tracing::field::Empty,
            input_bytes,
            elapsed_us = tracing::field::Empty,
        )
    });
    let _guard = span.as_ref().map(|span| span.enter());
    // Only pay for the clock when somebody will consume the measurement.
    let started = (collect_tokenize_metrics || trace_tokenize).then(Instant::now);
    let mut lexer = Self::new_with_interner(source, std::mem::take(interner));

    // Heuristic size hint: roughly one token per four input bytes.
    let target_capacity = input_bytes / 4 + 1;
    tokens.clear();
    // `Vec::reserve` counts from `len()`, which is 0 after `clear()`, so this
    // guarantees `capacity >= target_capacity` and is a no-op when the reused
    // buffer is already large enough. (Passing `target - capacity` here would
    // under-reserve, because the argument is not relative to the capacity.)
    tokens.reserve(target_capacity);

    loop {
        let tok = lexer.next_token();
        let is_eof = tok.kind == TokenKind::Eof;
        tokens.push(tok);
        if is_eof {
            break;
        }
    }
    // Hand the (possibly grown) interner back to the caller.
    *interner = lexer.interner;

    if let Some(started) = started {
        let elapsed_us = saturating_u64_from_u128(started.elapsed().as_micros());
        if let Some(span) = span.as_ref() {
            span.record("token_count", saturating_u64_from_usize(tokens.len()));
            span.record("elapsed_us", elapsed_us);
        }
        if collect_tokenize_metrics {
            record_tokenize_metrics(tokens.len(), elapsed_us);
        }
    }
}
/// Returns a snapshot of the global tokenize metrics.
///
/// Convenience wrapper around the free function `tokenize_metrics_snapshot`.
#[must_use]
pub fn metrics_snapshot() -> TokenizeMetricsSnapshot {
tokenize_metrics_snapshot()
}
/// Zeroes the global tokenize metrics.
///
/// Convenience wrapper around the free function `reset_tokenize_metrics`.
pub fn reset_metrics() {
reset_tokenize_metrics();
}
pub fn next_token(&mut self) -> Token {
self.skip_whitespace_and_comments();
if self.pos >= self.src.len() {
let token = self.make_token(TokenKind::Eof, self.pos, self.pos);
Self::log_token(&token);
return token;
}
let start = self.pos;
let start_line = self.line;
let start_col = self.col;
let ch = self.src[self.pos];
let kind = match ch {
b'\'' => self.lex_string(),
b'"' => self.lex_double_quoted_id(),
b'`' => self.lex_backtick_id(),
b'[' => self.lex_bracket_id(),
b'X' | b'x' if self.peek_at(1) == Some(b'\'') => self.lex_blob(),
b'0'..=b'9' => self.lex_number(),
b'.' if self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) => self.lex_number(),
b'a'..=b'z' | b'A'..=b'Z' | b'_' | 0x80..=0xFF => self.lex_identifier(),
b'?' => self.lex_question(),
b':' => self.lex_colon_param(),
b'@' => self.lex_at_param(),
b'$' => self.lex_dollar_param(),
b'+' => {
self.advance();
TokenKind::Plus
}
b'*' => {
self.advance();
TokenKind::Star
}
b'/' => {
self.advance();
TokenKind::Slash
}
b'%' => {
self.advance();
TokenKind::Percent
}
b'&' => {
self.advance();
TokenKind::Ampersand
}
b'~' => {
self.advance();
TokenKind::Tilde
}
b',' => {
self.advance();
TokenKind::Comma
}
b';' => {
self.advance();
TokenKind::Semicolon
}
b'(' => {
self.advance();
TokenKind::LeftParen
}
b')' => {
self.advance();
TokenKind::RightParen
}
b'.' => {
self.advance();
TokenKind::Dot
}
b'-' => self.lex_minus_or_arrow(),
b'<' => self.lex_lt(),
b'>' => self.lex_gt(),
b'=' => self.lex_eq(),
b'!' => self.lex_bang(),
b'|' => self.lex_pipe(),
_ => {
self.advance();
let s = String::from_utf8_lossy(&self.src[start..self.pos]).into_owned();
TokenKind::Error(format!("unexpected character: {s}"))
}
};
let token = Token {
kind,
#[allow(clippy::cast_possible_truncation)]
span: Span::new(start as u32, self.pos as u32),
line: start_line,
col: start_col,
};
Self::log_token(&token);
token
}
/// Consumes the next `n` bytes in one step, keeping `line` and `col` in sync.
///
/// Unlike `advance`, this emits no per-byte trace events. Panics if fewer
/// than `n` bytes remain.
#[allow(clippy::cast_possible_truncation)]
fn advance_by(&mut self, n: usize) {
    if n == 0 {
        return;
    }
    let end = self.pos + n;
    let chunk = &self.src[self.pos..end];
    match chunk.iter().rposition(|&b| b == b'\n') {
        Some(last_nl) => {
            #[allow(clippy::naive_bytecount)]
            let newlines = chunk.iter().filter(|&&b| b == b'\n').count();
            self.line += newlines as u32;
            // Bytes after the last newline, plus one because columns are 1-based.
            self.col = (n - last_nl) as u32;
        }
        None => self.col += n as u32,
    }
    self.pos = end;
}
/// Consumes and returns the next byte, updating `line`/`col`.
///
/// When per-byte tracing is on, a TRACE event carrying the byte and the
/// position it was read from is emitted. Panics at end of input.
fn advance(&mut self) -> u8 {
    let ch = self.src[self.pos];
    if self.trace_chars {
        // Reported coordinates are those of the byte itself, i.e. before the move.
        tracing::trace!(
            target: "fsqlite.parse",
            byte = ch,
            pos = self.pos,
            line = self.line,
            col = self.col,
            "tokenize char"
        );
    }
    self.pos += 1;
    match ch {
        b'\n' => {
            self.line += 1;
            self.col = 1;
        }
        _ => self.col += 1,
    }
    ch
}
/// Returns the next unread byte without consuming it, or `None` at end of input.
fn peek(&self) -> Option<u8> {
    self.peek_at(0)
}
/// Returns the byte `offset` positions past the cursor without consuming
/// anything, or `None` when that position is beyond the end of input.
fn peek_at(&self, offset: usize) -> Option<u8> {
    let index = self.pos + offset;
    match self.src.get(index) {
        Some(&byte) => Some(byte),
        None => None,
    }
}
/// Builds a token of `kind` covering bytes `start..end`, stamped with the
/// lexer's current line and column.
#[allow(clippy::cast_possible_truncation)]
fn make_token(&self, kind: TokenKind, start: usize, end: usize) -> Token {
    let span = Span::new(start as u32, end as u32);
    let (line, col) = (self.line, self.col);
    Token {
        kind,
        span,
        line,
        col,
    }
}
/// Advances past any run of ASCII whitespace, `-- line` comments and
/// `/* block */` comments.
///
/// An unterminated block comment swallows the rest of the input.
fn skip_whitespace_and_comments(&mut self) {
    let src = self.src;
    loop {
        let ws_len = src[self.pos..]
            .iter()
            .take_while(|b| b.is_ascii_whitespace())
            .count();
        if ws_len > 0 {
            self.advance_by(ws_len);
        }

        let rest = &src[self.pos..];
        if rest.is_empty() {
            break;
        }

        if rest.starts_with(b"--") {
            self.advance();
            self.advance();
            // Stop on the newline itself; the whitespace pass consumes it.
            while self.peek().is_some_and(|b| b != b'\n') {
                self.advance();
            }
            continue;
        }

        if rest.starts_with(b"/*") {
            self.advance();
            self.advance();
            let mut closed = false;
            while self.pos < src.len() {
                if src[self.pos..].starts_with(b"*/") {
                    self.advance();
                    self.advance();
                    closed = true;
                    break;
                }
                self.advance();
            }
            if !closed {
                self.pos = src.len();
            }
            continue;
        }

        break;
    }
}
/// Lexes a single-quoted string literal; the cursor must be on the opening `'`.
///
/// A doubled quote (`''`) inside the literal stands for one literal quote.
/// Returns `TokenKind::String` with the unescaped contents, or
/// `TokenKind::Error` when the closing quote is missing, in which case the
/// rest of the input is consumed.
fn lex_string(&mut self) -> TokenKind {
    let start = self.pos;
    // Copy of the slice reference, so sub-slices do not borrow `self`.
    let src = self.src;
    self.advance(); // opening quote
    let mut value = String::new();
    loop {
        let remaining = &src[self.pos..];
        let Some(offset) = memchr(b'\'', remaining) else {
            // Consume the tail through `advance_by` rather than assigning
            // `pos`, so line/column stay correct for the following Eof token
            // even when the unterminated literal spans several lines.
            self.advance_by(remaining.len());
            return TokenKind::Error(format!(
                "unterminated string literal starting at byte {}",
                start
            ));
        };
        value.push_str(&String::from_utf8_lossy(&remaining[..offset]));
        self.advance_by(offset);
        self.advance(); // the quote that was found
        if self.peek() == Some(b'\'') {
            // Escaped quote: keep one and continue scanning.
            value.push('\'');
            self.advance();
        } else {
            return TokenKind::String(value);
        }
    }
}
/// Lexes a double-quoted identifier; the cursor must be on the opening `"`.
///
/// A doubled quote (`""`) stands for one literal quote. Returns
/// `TokenKind::QuotedId(name, true)` with the interned, unescaped name, or
/// `TokenKind::Error` when the closing quote is missing, in which case the
/// rest of the input is consumed.
fn lex_double_quoted_id(&mut self) -> TokenKind {
    let start = self.pos;
    // Copy of the slice reference, so sub-slices do not borrow `self`.
    let src = self.src;
    self.advance(); // opening quote
    let mut value = String::new();
    loop {
        let remaining = &src[self.pos..];
        let Some(offset) = memchr(b'"', remaining) else {
            // Consume the tail through `advance_by` rather than assigning
            // `pos`, so line/column stay correct for the following Eof token.
            self.advance_by(remaining.len());
            return TokenKind::Error(format!(
                "unterminated double-quoted identifier at byte {}",
                start
            ));
        };
        value.push_str(&String::from_utf8_lossy(&remaining[..offset]));
        self.advance_by(offset);
        self.advance(); // the quote that was found
        if self.peek() == Some(b'"') {
            // Escaped quote: keep one and continue scanning.
            value.push('"');
            self.advance();
        } else {
            return TokenKind::QuotedId(self.interner.intern(&value), true);
        }
    }
}
/// Lexes a backtick-quoted identifier; the cursor must be on the opening
/// backtick.
///
/// A doubled backtick stands for one literal backtick. Returns
/// `TokenKind::QuotedId(name, false)` with the interned, unescaped name, or
/// `TokenKind::Error` when the closing backtick is missing, in which case
/// the rest of the input is consumed.
fn lex_backtick_id(&mut self) -> TokenKind {
    let start = self.pos;
    // Copy of the slice reference, so sub-slices do not borrow `self`.
    let src = self.src;
    self.advance(); // opening backtick
    let mut value = String::new();
    loop {
        let remaining = &src[self.pos..];
        let Some(offset) = memchr(b'`', remaining) else {
            // Consume the tail through `advance_by` rather than assigning
            // `pos`, so line/column stay correct for the following Eof token.
            self.advance_by(remaining.len());
            return TokenKind::Error(format!(
                "unterminated backtick identifier at byte {}",
                start
            ));
        };
        value.push_str(&String::from_utf8_lossy(&remaining[..offset]));
        self.advance_by(offset);
        self.advance(); // the backtick that was found
        if self.peek() == Some(b'`') {
            // Escaped backtick: keep one and continue scanning.
            value.push('`');
            self.advance();
        } else {
            return TokenKind::QuotedId(self.interner.intern(&value), false);
        }
    }
}
fn lex_bracket_id(&mut self) -> TokenKind {
let start = self.pos;
self.advance();
let mut value = String::new();
let remaining = &self.src[self.pos..];
if let Some(offset) = memchr(b']', remaining) {
value.push_str(&String::from_utf8_lossy(
&self.src[self.pos..self.pos + offset],
));
self.advance_by(offset);
self.advance(); TokenKind::QuotedId(self.interner.intern(&value), false)
} else {
self.pos = self.src.len();
TokenKind::Error(format!("unterminated bracket identifier at byte {}", start))
}
}
fn lex_blob(&mut self) -> TokenKind {
let start = self.pos;
self.advance(); self.advance();
let hex_start = self.pos;
let remaining = &self.src[self.pos..];
if let Some(offset) = memchr(b'\'', remaining) {
let hex_bytes = &self.src[hex_start..hex_start + offset];
self.advance_by(offset);
self.advance();
if hex_bytes.len() % 2 != 0 {
return TokenKind::Error(format!(
"blob literal has odd number of hex digits at byte {}",
start
));
}
let mut bytes = Vec::with_capacity(hex_bytes.len() / 2);
for pair in hex_bytes.chunks_exact(2) {
let hi = hex_digit(pair[0]);
let lo = hex_digit(pair[1]);
match (hi, lo) {
(Some(h), Some(l)) => bytes.push((h << 4) | l),
_ => {
return TokenKind::Error(format!(
"invalid hex in blob literal at byte {start}"
));
}
}
}
TokenKind::Blob(bytes)
} else {
self.pos = self.src.len();
TokenKind::Error(format!("unterminated blob literal at byte {}", start))
}
}
/// Lexes a numeric literal starting at the cursor: a `0x` hex integer, a
/// decimal integer, or a float (fraction and/or exponent).
///
/// Returns:
/// - `TokenKind::Integer` for hex literals of at most 16 significant digits
///   (reinterpreted as a two's-complement `i64`) and decimal integers that
///   fit in `i64`;
/// - `TokenKind::OversizedInt` with the original text for decimal integers
///   that do not fit in `i64`;
/// - `TokenKind::Float` for floats, with non-finite parse results replaced by
///   `f64::MAX`;
/// - `TokenKind::Error` for an empty or over-long hex literal, or a number
///   immediately followed by an ASCII letter or `_`.
fn lex_number(&mut self) -> TokenKind {
let start = self.pos;
// ---- Hexadecimal: `0x` / `0X` followed by one or more hex digits. ----
if self.src[self.pos] == b'0' && self.peek_at(1).is_some_and(|c| c == b'x' || c == b'X') {
// Skip the `0` and the `x`, then remember where the digits begin.
self.advance(); self.advance(); let hex_start = self.pos;
while self.pos < self.src.len() && self.src[self.pos].is_ascii_hexdigit() {
self.advance();
}
if self.pos == hex_start {
return TokenKind::Error("empty hex literal".to_owned());
}
// NOTE(review): unlike the decimal path below, a hex literal directly
// followed by an identifier character (e.g. `0x1G`) is not rejected here;
// it lexes as an integer followed by an identifier. Confirm this is intended.
let hex_str = String::from_utf8_lossy(&self.src[hex_start..self.pos]);
// Leading zeros do not count toward the 16-digit (64-bit) limit.
let significant = hex_str.trim_start_matches('0');
if significant.len() > 16 {
return TokenKind::Error(format!("hex literal out of range at byte {start}"));
}
let parse_str = if significant.is_empty() {
"0"
} else {
significant
};
return match u64::from_str_radix(parse_str, 16) {
Ok(v) => {
// Bit-for-bit reinterpretation: 0xFFFF_FFFF_FFFF_FFFF becomes -1.
#[allow(clippy::cast_possible_wrap)]
let i = v as i64;
TokenKind::Integer(i)
}
Err(_) => TokenKind::Error(format!("hex literal out of range at byte {start}")),
};
}
// ---- Decimal: integer part (empty when the literal starts with `.`). ----
let mut is_float = false;
while self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
self.advance();
}
// True when the bytes at `offset` from the cursor form a complete exponent
// prefix: `e`/`E`, an optional sign, then at least one digit.
let is_valid_exponent = |lexer: &Self, mut offset: usize| -> bool {
if let Some(c) = lexer.peek_at(offset) {
if c == b'e' || c == b'E' {
offset += 1;
if let Some(s) = lexer.peek_at(offset) {
if s == b'+' || s == b'-' {
offset += 1;
}
}
if let Some(d) = lexer.peek_at(offset) {
return d.is_ascii_digit();
}
}
}
false
};
// Fraction: `.` followed by a digit, or by a valid exponent (as in `1.e5`).
if self.pos < self.src.len()
&& self.src[self.pos] == b'.'
&& (self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) || is_valid_exponent(self, 1))
{
is_float = true;
self.advance(); while self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
self.advance();
}
// Trailing dot after digits (as in `1.`) that does not begin an identifier.
} else if self.pos < self.src.len()
&& self.src[self.pos] == b'.'
&& start < self.pos && !self.peek_at(1).is_some_and(|c| c.is_ascii_alphanumeric() || c == b'_')
{
is_float = true;
self.advance(); }
// A literal that began with `.` is always a float.
if self.src[start] == b'.' {
is_float = true;
}
// Exponent: consume `e`/`E`, an optional sign, and the digits.
if is_valid_exponent(self, 0) {
is_float = true;
self.advance(); if self.pos < self.src.len()
&& (self.src[self.pos] == b'+' || self.src[self.pos] == b'-')
{
self.advance();
}
while self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
self.advance();
}
}
// A number running straight into identifier characters (`123a`, `123.a`)
// is one unrecognized token; swallow the whole run so the error names it.
if let Some(c) = self.peek() {
if c.is_ascii_alphabetic()
|| c == b'_'
|| (c == b'.'
&& self
.peek_at(1)
.is_some_and(|n| n.is_ascii_alphabetic() || n == b'_'))
{
let err_start = start;
while self.pos < self.src.len() {
let ch = self.src[self.pos];
if ch.is_ascii_alphanumeric() || ch == b'_' || ch == b'.' {
self.advance();
} else {
break;
}
}
let err_text = String::from_utf8_lossy(&self.src[err_start..self.pos]);
return TokenKind::Error(format!("unrecognized token: \"{err_text}\""));
}
}
let text = String::from_utf8_lossy(&self.src[start..self.pos]);
if is_float {
// Overflowing literals parse to infinity; clamp them to the largest finite value.
let clamp = |v: f64| -> f64 { if v.is_finite() { v } else { f64::MAX } };
match text.parse::<f64>() {
Ok(v) => TokenKind::Float(clamp(v)),
Err(_) => {
// Retry with a leading `0` for text of the form `.e…`.
// NOTE(review): `next_token` only dispatches here for `.` when a digit
// follows, so this fallback looks unreachable from that path — confirm.
let mut text_fixed = text.clone().into_owned();
if text_fixed.starts_with(".e") || text_fixed.starts_with(".E") {
text_fixed.insert(0, '0');
}
match text_fixed.parse::<f64>() {
Ok(v) => TokenKind::Float(clamp(v)),
Err(_) => TokenKind::Error(format!("invalid float: {text}")),
}
}
}
} else {
match text.parse::<i64>() {
Ok(v) => TokenKind::Integer(v),
Err(_) => {
// Too large for i64: keep the digits verbatim for the caller to handle.
TokenKind::OversizedInt(text.into_owned())
}
}
}
}
/// Lexes a bare identifier or keyword starting at the cursor.
///
/// The first byte is taken unconditionally; following bytes are consumed
/// while they are ASCII alphanumerics, `_`, or non-ASCII (>= 0x80). Text
/// that matches a keyword yields that keyword's token; anything else becomes
/// an interned `TokenKind::Id`.
fn lex_identifier(&mut self) -> TokenKind {
    let start = self.pos;
    self.advance();
    while self
        .peek()
        .is_some_and(|b| b.is_ascii_alphanumeric() || b == b'_' || b >= 0x80)
    {
        self.advance();
    }

    let ident_bytes = &self.src[start..self.pos];
    match TokenKind::lookup_keyword_bytes(ident_bytes) {
        Some(keyword) => keyword,
        None => {
            let text = String::from_utf8_lossy(ident_bytes);
            TokenKind::Id(self.interner.intern(&text))
        }
    }
}
/// Lexes a positional parameter: a bare `?` or a numbered `?NNN`.
///
/// Numbers must parse as `u32` and lie in `1..=MAX_VARIABLE_NUMBER`;
/// otherwise a `TokenKind::Error` is returned.
fn lex_question(&mut self) -> TokenKind {
    self.advance(); // '?'
    let num_start = self.pos;
    while self.peek().is_some_and(|b| b.is_ascii_digit()) {
        self.advance();
    }
    if self.pos == num_start {
        return TokenKind::Question;
    }

    let digits = String::from_utf8_lossy(&self.src[num_start..self.pos]);
    match digits.parse::<u32>() {
        Err(_) => TokenKind::Error("invalid parameter number".to_owned()),
        Ok(n) if (1..=MAX_VARIABLE_NUMBER).contains(&n) => TokenKind::QuestionNum(n),
        Ok(n) => TokenKind::Error(format!(
            "variable number must be between ?1 and ?{MAX_VARIABLE_NUMBER}, got ?{n}"
        )),
    }
}
/// Lexes a named parameter introduced by `prefix` (`:`, `@` or `$`) and wraps
/// the name, without the prefix, using `constructor`.
///
/// The name may contain ASCII alphanumerics, `_`, non-ASCII bytes and `::`
/// separators, optionally ending in one parenthesised suffix such as
/// `(index)`. An unclosed parenthesis or an empty name yields
/// `TokenKind::Error`.
fn lex_alpha_param(&mut self, prefix: char, constructor: fn(String) -> TokenKind) -> TokenKind {
    self.advance(); // the prefix character
    let name_start = self.pos;

    while let Some(ch) = self.peek() {
        if ch.is_ascii_alphanumeric() || ch == b'_' || ch >= 0x80 {
            self.advance();
            continue;
        }
        if ch == b':' && self.peek_at(1) == Some(b':') {
            self.advance();
            self.advance();
            continue;
        }
        if ch != b'(' {
            break;
        }

        // Parenthesised suffix: take everything up to and including ')'.
        self.advance();
        while self.peek().is_some_and(|b| b != b')') {
            self.advance();
        }
        if self.peek() != Some(b')') {
            let name = String::from_utf8_lossy(&self.src[name_start..self.pos]);
            return TokenKind::Error(format!("unrecognized token: \"{prefix}{name}\""));
        }
        self.advance();
        // The suffix always ends the name.
        break;
    }

    if self.pos == name_start {
        return TokenKind::Error(format!("empty parameter name after '{prefix}'"));
    }
    constructor(String::from_utf8_lossy(&self.src[name_start..self.pos]).into_owned())
}
/// Lexes a `:name` parameter into `TokenKind::ColonParam` (see `lex_alpha_param`).
fn lex_colon_param(&mut self) -> TokenKind {
self.lex_alpha_param(':', TokenKind::ColonParam)
}
/// Lexes an `@name` parameter into `TokenKind::AtParam` (see `lex_alpha_param`).
fn lex_at_param(&mut self) -> TokenKind {
self.lex_alpha_param('@', TokenKind::AtParam)
}
/// Lexes a `$name` parameter into `TokenKind::DollarParam` (see `lex_alpha_param`).
fn lex_dollar_param(&mut self) -> TokenKind {
self.lex_alpha_param('$', TokenKind::DollarParam)
}
/// Lexes `-`, `->` or `->>`, preferring the longest match.
fn lex_minus_or_arrow(&mut self) -> TokenKind {
    self.advance(); // '-'
    if self.peek() != Some(b'>') {
        return TokenKind::Minus;
    }
    self.advance(); // first '>'
    if self.peek() != Some(b'>') {
        return TokenKind::Arrow;
    }
    self.advance(); // second '>'
    TokenKind::DoubleArrow
}
/// Lexes `<`, `<=`, `<>` or `<<`.
fn lex_lt(&mut self) -> TokenKind {
    self.advance(); // '<'
    let two_byte = match self.peek() {
        Some(b'=') => TokenKind::Le,
        Some(b'>') => TokenKind::LtGt,
        Some(b'<') => TokenKind::ShiftLeft,
        _ => return TokenKind::Lt,
    };
    self.advance(); // second byte of the operator
    two_byte
}
/// Lexes `>`, `>=` or `>>`.
fn lex_gt(&mut self) -> TokenKind {
    self.advance(); // '>'
    let two_byte = match self.peek() {
        Some(b'=') => TokenKind::Ge,
        Some(b'>') => TokenKind::ShiftRight,
        _ => return TokenKind::Gt,
    };
    self.advance(); // second byte of the operator
    two_byte
}
/// Lexes `=` or `==`.
fn lex_eq(&mut self) -> TokenKind {
    self.advance(); // first '='
    if self.peek() != Some(b'=') {
        return TokenKind::Eq;
    }
    self.advance(); // second '='
    TokenKind::EqEq
}
fn lex_bang(&mut self) -> TokenKind {
self.advance(); if self.peek() == Some(b'=') {
self.advance();
TokenKind::Ne
} else {
TokenKind::Error("unexpected '!', did you mean '!='?".to_owned())
}
}
/// Lexes `|` or the concatenation operator `||`.
fn lex_pipe(&mut self) -> TokenKind {
    self.advance(); // first '|'
    if self.peek() != Some(b'|') {
        return TokenKind::Pipe;
    }
    self.advance(); // second '|'
    TokenKind::Concat
}
}
/// Decodes one ASCII hex digit (`0-9`, `a-f`, `A-F`) into its value `0..=15`.
/// Returns `None` for any other byte.
const fn hex_digit(b: u8) -> Option<u8> {
    if b.is_ascii_digit() {
        return Some(b - b'0');
    }
    // Fold case so a single range check covers both `a-f` and `A-F`.
    let lower = b.to_ascii_lowercase();
    if matches!(lower, b'a'..=b'f') {
        Some(lower - b'a' + 10)
    } else {
        None
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `src` into full tokens (kind, span, line, column).
    fn lex(src: &str) -> Vec<Token> {
        Lexer::tokenize(src)
    }

    /// Tokenizes `src` and keeps only the token kinds.
    fn kinds(src: &str) -> Vec<TokenKind> {
        lex(src).into_iter().map(|t| t.kind).collect()
    }

    #[test]
    fn test_lex_integer_literals() {
        let tokens = kinds("42 0 0xFF");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Integer(42),
                TokenKind::Integer(0),
                TokenKind::Integer(255),
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_tokenize_into_reuses_caller_owned_capacity() {
        let mut scratch = Vec::new();
        Lexer::tokenize_into(
            "SELECT 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz';",
            &mut scratch,
        );
        let warmed_capacity = scratch.capacity();
        assert!(
            warmed_capacity > 0,
            "warm parse should allocate token scratch"
        );
        Lexer::tokenize_into("SELECT 1;", &mut scratch);
        assert_eq!(
            scratch.capacity(),
            warmed_capacity,
            "smaller follow-up parse should reuse the warmed token buffer",
        );
        assert_eq!(
            scratch.last().map(|token| &token.kind),
            Some(&TokenKind::Eof),
            "tokenize_into should still terminate with EOF in reused scratch",
        );
    }

    #[test]
    fn test_lex_float_literals() {
        let tokens = kinds("3.14 1e10 .5 1.0e-3 0.0");
        let expected = 3.0 + 0.14;
        assert!(matches!(
            tokens[0],
            TokenKind::Float(v) if (v - expected).abs() < 1e-10
        ));
        assert!(matches!(tokens[1], TokenKind::Float(v) if (v - 1e10).abs() < 1.0));
        assert!(matches!(tokens[2], TokenKind::Float(v) if (v - 0.5).abs() < 1e-10));
        assert!(matches!(tokens[3], TokenKind::Float(v) if (v - 0.001).abs() < 1e-10));
        assert!(matches!(tokens[4], TokenKind::Float(v) if v.abs() < 1e-10));
        assert_eq!(tokens[5], TokenKind::Eof);
    }

    #[test]
    fn test_lex_string_literals() {
        let tokens = kinds("'hello' 'it''s' ''");
        assert_eq!(tokens[0], TokenKind::String("hello".to_owned()));
        assert_eq!(tokens[1], TokenKind::String("it's".to_owned()));
        assert_eq!(tokens[2], TokenKind::String(String::new()));
        assert_eq!(tokens[3], TokenKind::Eof);
    }

    #[test]
    fn test_lex_blob_literals() {
        let tokens = kinds("X'CAFE' x'00ff' X''");
        assert_eq!(tokens[0], TokenKind::Blob(vec![0xCA, 0xFE]));
        assert_eq!(tokens[1], TokenKind::Blob(vec![0x00, 0xFF]));
        assert_eq!(tokens[2], TokenKind::Blob(vec![]));
        assert_eq!(tokens[3], TokenKind::Eof);
    }

    #[test]
    fn test_lex_blob_odd_hex_error() {
        let tokens = kinds("X'CAF'");
        assert!(matches!(tokens[0], TokenKind::Error(_)));
    }

    #[test]
    fn test_lex_blob_non_ascii_no_panic() {
        // Multi-byte UTF-8 inside the quotes must produce an error, not a panic.
        let tokens = kinds("X'U\u{05fc} '");
        assert!(matches!(tokens[0], TokenKind::Error(_)));
        let tokens2 = kinds("X'GG'");
        assert!(matches!(tokens2[0], TokenKind::Error(_)));
    }

    #[test]
    fn test_lex_variables() {
        let tokens = kinds("?1 :name @param $var ?");
        assert_eq!(tokens[0], TokenKind::QuestionNum(1));
        assert_eq!(tokens[1], TokenKind::ColonParam("name".to_owned()));
        assert_eq!(tokens[2], TokenKind::AtParam("param".to_owned()));
        assert_eq!(tokens[3], TokenKind::DollarParam("var".to_owned()));
        assert_eq!(tokens[4], TokenKind::Question);
        assert_eq!(tokens[5], TokenKind::Eof);
    }

    #[test]
    fn test_lex_quoted_identifiers() {
        let tokens = kinds("\"table_name\" [column] `backtick`");
        assert_eq!(tokens[0], TokenKind::QuotedId("table_name".into(), true));
        assert_eq!(tokens[1], TokenKind::QuotedId("column".into(), false));
        assert_eq!(tokens[2], TokenKind::QuotedId("backtick".into(), false));
    }

    #[test]
    fn test_lex_dqs_flag() {
        let tokens = kinds("\"hello\"");
        assert_eq!(tokens[0], TokenKind::QuotedId("hello".into(), true));
    }

    #[test]
    fn test_lex_keywords() {
        let tokens = kinds("SELECT FROM WHERE INSERT CREATE TABLE CONCURRENT");
        assert_eq!(tokens[0], TokenKind::KwSelect);
        assert_eq!(tokens[1], TokenKind::KwFrom);
        assert_eq!(tokens[2], TokenKind::KwWhere);
        assert_eq!(tokens[3], TokenKind::KwInsert);
        assert_eq!(tokens[4], TokenKind::KwCreate);
        assert_eq!(tokens[5], TokenKind::KwTable);
        assert_eq!(tokens[6], TokenKind::KwConcurrent);
        // Keyword lookup is case-insensitive.
        let tokens2 = kinds("select from where");
        assert_eq!(tokens2[0], TokenKind::KwSelect);
        assert_eq!(tokens2[1], TokenKind::KwFrom);
        assert_eq!(tokens2[2], TokenKind::KwWhere);
    }

    #[test]
    fn test_lex_operators() {
        let tokens = kinds("+ - * / % & | ~ << >> = < <= > >= == != <> || -> ->>");
        let expected = vec![
            TokenKind::Plus,
            TokenKind::Minus,
            TokenKind::Star,
            TokenKind::Slash,
            TokenKind::Percent,
            TokenKind::Ampersand,
            TokenKind::Pipe,
            TokenKind::Tilde,
            TokenKind::ShiftLeft,
            TokenKind::ShiftRight,
            TokenKind::Eq,
            TokenKind::Lt,
            TokenKind::Le,
            TokenKind::Gt,
            TokenKind::Ge,
            TokenKind::EqEq,
            TokenKind::Ne,
            TokenKind::LtGt,
            TokenKind::Concat,
            TokenKind::Arrow,
            TokenKind::DoubleArrow,
            TokenKind::Eof,
        ];
        assert_eq!(tokens, expected);
    }

    #[test]
    fn test_lex_eq_vs_eqeq() {
        let tokens = kinds("= ==");
        assert_eq!(tokens[0], TokenKind::Eq);
        assert_eq!(tokens[1], TokenKind::EqEq);
    }

    #[test]
    fn test_lex_ne_vs_ltgt() {
        let tokens = kinds("!= <>");
        assert_eq!(tokens[0], TokenKind::Ne);
        assert_eq!(tokens[1], TokenKind::LtGt);
    }

    #[test]
    fn test_lex_error_unterminated_string() {
        let tokens = kinds("'hello");
        assert!(matches!(tokens[0], TokenKind::Error(_)));
    }

    #[test]
    fn test_lex_line_column_tracking() {
        // Continuation lines are indented by two spaces, so `a` and `b` sit
        // at column 3 and the comma at column 4 (columns are 1-based).
        let tokens = lex("SELECT\n  a,\n  b");
        assert_eq!(tokens[0].line, 1);
        assert_eq!(tokens[0].col, 1);
        assert_eq!(tokens[1].line, 2);
        assert_eq!(tokens[1].col, 3);
        assert_eq!(tokens[2].line, 2);
        assert_eq!(tokens[2].col, 4);
        assert_eq!(tokens[3].line, 3);
        assert_eq!(tokens[3].col, 3);
    }

    #[test]
    fn test_lex_whitespace_and_comments_skipped() {
        let tokens = kinds("SELECT -- this is a comment\n a /* block */ FROM b");
        assert_eq!(tokens[0], TokenKind::KwSelect);
        assert_eq!(tokens[1], TokenKind::Id("a".into()));
        assert_eq!(tokens[2], TokenKind::KwFrom);
        assert_eq!(tokens[3], TokenKind::Id("b".into()));
        assert_eq!(tokens[4], TokenKind::Eof);
    }

    #[test]
    fn test_lex_hex_large_values() {
        // 64-bit hex literals are reinterpreted as two's-complement i64.
        let tokens = kinds("0xFFFFFFFFFFFFFFFF");
        assert_eq!(tokens[0], TokenKind::Integer(-1));
        let tokens = kinds("0x8000000000000000");
        assert_eq!(tokens[0], TokenKind::Integer(i64::MIN));
        let tokens = kinds("0x7FFFFFFFFFFFFFFF");
        assert_eq!(tokens[0], TokenKind::Integer(i64::MAX));
    }

    #[test]
    fn test_lex_hex_overflow_17_digits_rejects() {
        let tokens = kinds("0x10000000000000000");
        assert!(
            matches!(&tokens[0], TokenKind::Error(msg) if msg.contains("out of range")),
            "expected error for 17-digit hex, got {:?}",
            tokens[0]
        );
    }

    #[test]
    fn test_lex_hex_leading_zeros_accepted() {
        let tokens = kinds("0x00000000000000001");
        assert_eq!(tokens[0], TokenKind::Integer(1));
    }

    #[test]
    fn test_lex_number_hex() {
        let tokens = kinds("0x1A 0Xff 0x0");
        assert_eq!(tokens[0], TokenKind::Integer(26));
        assert_eq!(tokens[1], TokenKind::Integer(255));
        assert_eq!(tokens[2], TokenKind::Integer(0));
        assert_eq!(tokens[3], TokenKind::Eof);
    }

    #[test]
    fn test_lex_number_unrecognized() {
        let tokens = kinds("123a 123.a");
        assert!(
            matches!(tokens[0], TokenKind::Error(ref e) if e.contains("unrecognized token: \"123a\""))
        );
        assert!(
            matches!(tokens[1], TokenKind::Error(ref e) if e.contains("unrecognized token: \"123.a\""))
        );
    }

    #[test]
    fn test_lex_number_hex_invalid() {
        let tokens = kinds("0x");
        assert!(matches!(tokens[0], TokenKind::Error(_)));
    }

    #[test]
    fn test_lex_positional_params() {
        let tokens = kinds("? ?123");
        assert_eq!(tokens[0], TokenKind::Question);
        assert_eq!(tokens[1], TokenKind::QuestionNum(123));
        assert_eq!(tokens[2], TokenKind::Eof);
    }

    #[test]
    fn test_lex_positional_params_reject_zero_and_out_of_range() {
        let tokens = kinds("?0 ?32767");
        assert!(
            matches!(tokens[0], TokenKind::Error(ref e) if e.contains("between ?1 and ?32766")),
            "expected ?0 to be rejected, got {:?}",
            tokens[0]
        );
        assert!(
            matches!(tokens[1], TokenKind::Error(ref e) if e.contains("between ?1 and ?32766")),
            "expected ?32767 to be rejected, got {:?}",
            tokens[1]
        );
        assert_eq!(tokens[2], TokenKind::Eof);
    }

    #[test]
    fn test_lex_named_params() {
        let tokens = kinds(":foo @bar $baz_123");
        assert_eq!(tokens[0], TokenKind::ColonParam("foo".to_owned()));
        assert_eq!(tokens[1], TokenKind::AtParam("bar".to_owned()));
        assert_eq!(tokens[2], TokenKind::DollarParam("baz_123".to_owned()));
        assert_eq!(tokens[3], TokenKind::Eof);
    }

    #[test]
    fn test_lex_named_params_with_tcl_syntax() {
        let tokens = kinds("$::foo(bar) :a::b");
        assert_eq!(tokens[0], TokenKind::DollarParam("::foo(bar)".to_owned()));
        assert_eq!(tokens[1], TokenKind::ColonParam("a::b".to_owned()));
        assert_eq!(tokens[2], TokenKind::Eof);
    }

    #[test]
    fn test_lex_named_params_with_unclosed_tcl_array_syntax() {
        let tokens = kinds("$::foo(bar");
        assert!(
            matches!(tokens[0], TokenKind::Error(ref e) if e.contains("unrecognized token")),
            "expected unterminated Tcl-style parameter to be rejected, got {:?}",
            tokens[0]
        );
        assert_eq!(tokens[1], TokenKind::Eof);
    }

    /// Sums every bucket of the duration histogram.
    fn histogram_total(hist: &TokenizeDurationSecondsHistogram) -> u64 {
        hist.le_100us + hist.le_250us + hist.le_500us + hist.le_1ms + hist.le_5ms + hist.gt_5ms
    }

    // NOTE(review): the three metrics tests below mutate process-wide counters
    // and the enable flag. Other tests in this module call the lexer too, so
    // under the default parallel test runner the exact-count assertions may be
    // racy — confirm these run serially or in isolation.

    #[test]
    fn test_tokenize_metrics_accumulate_tokens_and_histogram_samples() {
        let prev_metrics_enabled = tokenize_metrics_enabled();
        reset_tokenize_metrics();
        set_tokenize_metrics_enabled(true);
        let first = lex("SELECT 1;");
        let second = lex("SELECT 2;");
        let expected_total_tokens = u64::try_from(first.len() + second.len()).unwrap_or(u64::MAX);
        let snap = tokenize_metrics_snapshot();
        assert_eq!(snap.fsqlite_tokenize_tokens_total, expected_total_tokens);
        assert_eq!(snap.fsqlite_tokenize_duration_seconds_count, 2);
        assert_eq!(
            histogram_total(&snap.fsqlite_tokenize_duration_seconds),
            snap.fsqlite_tokenize_duration_seconds_count
        );
        set_tokenize_metrics_enabled(prev_metrics_enabled);
        reset_tokenize_metrics();
    }

    #[test]
    fn test_tokenize_metrics_reset_clears_all_fields() {
        let prev_metrics_enabled = tokenize_metrics_enabled();
        reset_tokenize_metrics();
        set_tokenize_metrics_enabled(true);
        let _ = lex("SELECT 42;");
        let before = tokenize_metrics_snapshot();
        assert!(before.fsqlite_tokenize_tokens_total > 0);
        assert!(before.fsqlite_tokenize_duration_seconds_count > 0);
        reset_tokenize_metrics();
        let after = tokenize_metrics_snapshot();
        assert_eq!(after.fsqlite_tokenize_tokens_total, 0);
        assert_eq!(after.fsqlite_tokenize_duration_seconds_count, 0);
        assert_eq!(after.fsqlite_tokenize_duration_seconds_sum_micros, 0);
        assert_eq!(histogram_total(&after.fsqlite_tokenize_duration_seconds), 0);
        set_tokenize_metrics_enabled(prev_metrics_enabled);
    }

    #[test]
    fn test_tokenize_metrics_can_be_disabled_off_hot_path() {
        let prev_metrics_enabled = tokenize_metrics_enabled();
        reset_tokenize_metrics();
        set_tokenize_metrics_enabled(false);
        let _ = lex("SELECT 99;");
        let snap = tokenize_metrics_snapshot();
        assert_eq!(snap.fsqlite_tokenize_tokens_total, 0);
        assert_eq!(snap.fsqlite_tokenize_duration_seconds_count, 0);
        assert_eq!(snap.fsqlite_tokenize_duration_seconds_sum_micros, 0);
        assert_eq!(histogram_total(&snap.fsqlite_tokenize_duration_seconds), 0);
        set_tokenize_metrics_enabled(prev_metrics_enabled);
        reset_tokenize_metrics();
    }
}