// syntaqlite_syntax/tokenizer.rs
1// Copyright 2025 The syntaqlite Authors. All rights reserved.
2// Licensed under the Apache License, Version 2.0.
3
4use std::cell::RefCell;
5use std::ffi::CStr;
6use std::marker::PhantomData;
7use std::ptr::NonNull;
8use std::rc::Rc;
9
10use crate::any::AnyTokenType;
11use crate::ast::GrammarTokenType;
12use crate::grammar::{AnyGrammar, TypedGrammar};
13
14#[cfg(feature = "sqlite")]
15use crate::sqlite::grammar::Grammar;
16#[cfg(feature = "sqlite")]
17use crate::sqlite::tokens::TokenType;
18
19// ── Public API ───────────────────────────────────────────────────────────────
20
/// High-level tokenizer for `SQLite` SQL.
///
/// In most codebases this is the tokenizer you want.
///
/// - Fast lexical analysis without building an AST.
/// - Returns token kind + original source slice.
/// - Reusable across many SQL inputs.
///
/// Advanced generic tokenizer APIs exist in [`crate::typed`] and [`crate::any`].
//
// NOTE(review): this type carries full user-facing docs and doctests yet is
// `#[doc(hidden)]` — presumably it is re-exported and documented at the crate
// root; confirm the hidden attribute is intentional.
#[cfg(feature = "sqlite")]
#[doc(hidden)]
pub struct Tokenizer(TypedTokenizer<Grammar>);
33
#[cfg(feature = "sqlite")]
impl Tokenizer {
    /// Construct a tokenizer wired to the built-in `SQLite` grammar.
    pub fn new() -> Self {
        Tokenizer(TypedTokenizer::new(crate::sqlite::grammar::grammar()))
    }

    /// Lex one SQL source string and iterate its `SQLite` tokens.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use syntaqlite_syntax::{Tokenizer, TokenType};
    ///
    /// let tokenizer = Tokenizer::new();
    /// let tokens: Vec<_> = tokenizer
    ///     .tokenize("SELECT x FROM t")
    ///     .map(|tok| (tok.token_type(), tok.text().to_string()))
    ///     .collect();
    ///
    /// assert!(tokens.iter().any(|(ty, _)| *ty == TokenType::Select));
    /// assert!(tokens.iter().any(|(_, text)| text == "x"));
    /// ```
    ///
    /// # Panics
    ///
    /// Panics when a cursor returned by an earlier call on this tokenizer is
    /// still alive; drop that iterator before starting a new one.
    pub fn tokenize<'a>(&self, source: &'a str) -> impl Iterator<Item = Token<'a>> {
        let typed = self.0.tokenize(source);
        typed.map(Token)
    }

    /// Zero-copy tokenization over a null-terminated source buffer.
    ///
    /// Reach for this when the SQL already lives in a [`CStr`] and copying
    /// it would be wasted work.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use std::ffi::CString;
    /// use syntaqlite_syntax::{Tokenizer, TokenType};
    ///
    /// let tokenizer = Tokenizer::new();
    /// let sql = CString::new("SELECT 1").unwrap();
    /// let types: Vec<_> = tokenizer.tokenize_cstr(&sql).map(|t| t.token_type()).collect();
    ///
    /// assert!(types.contains(&TokenType::Select));
    /// ```
    ///
    /// # Panics
    ///
    /// Panics when a cursor from this tokenizer is still alive, or when
    /// `source` is not valid UTF-8.
    pub fn tokenize_cstr<'a>(&self, source: &'a CStr) -> impl Iterator<Item = Token<'a>> {
        let typed = self.0.tokenize_cstr(source);
        typed.map(Token)
    }
}
92
#[cfg(feature = "sqlite")]
impl Default for Tokenizer {
    /// Equivalent to [`Tokenizer::new`].
    fn default() -> Self {
        Tokenizer::new()
    }
}
99
/// Token emitted by [`Tokenizer`](crate::Tokenizer), including kind and source slice.
///
/// Typical usage:
///
/// - Inspect token kind via [`token_type`](Self::token_type).
/// - Read exact source text via [`text`](Self::text).
//
// NOTE(review): documented for users but `#[doc(hidden)]` — presumably
// re-exported at the crate root; confirm the attribute is intentional.
#[cfg(feature = "sqlite")]
#[doc(hidden)]
pub struct Token<'a>(TypedToken<'a, Grammar>);
109
#[cfg(feature = "sqlite")]
impl<'a> Token<'a> {
    /// The `SQLite` token type.
    pub fn token_type(&self) -> TokenType {
        let Token(inner) = self;
        inner.token_type()
    }

    /// The source text slice covered by this token.
    pub fn text(&self) -> &'a str {
        let Token(inner) = self;
        inner.text()
    }
}
122
/// Tokenizer parameterized by grammar type `G`.
///
/// Useful for reusable tooling built against generated grammars.
///
/// - Use this when grammar type is known at compile time.
/// - Use [`Tokenizer`] for typical `SQLite` SQL app code.
///
pub struct TypedTokenizer<G: TypedGrammar> {
    // Shared slot holding the C tokenizer state. `Some` while idle; taken
    // (left `None`) while a cursor from `tokenize`/`tokenize_cstr` is alive,
    // which is how "one live cursor at a time" is enforced. The cursor
    // returns the state here on drop.
    inner: Rc<RefCell<Option<TokenizerInner>>>,
    // Ties the grammar type parameter to the tokenizer without storing a `G`.
    _marker: PhantomData<G>,
}
134
135impl<G: TypedGrammar> TypedTokenizer<G> {
136 /// Create a tokenizer for grammar `G`.
137 ///
138 /// # Examples
139 ///
140 /// ```rust
141 /// use syntaqlite_syntax::typed::{grammar, TypedTokenizer};
142 ///
143 /// let _tokenizer = TypedTokenizer::new(grammar());
144 /// ```
145 ///
146 /// # Panics
147 ///
148 /// Panics if tokenizer allocation fails (out of memory).
149 pub fn new(grammar: G) -> Self {
150 // SAFETY: create(NULL, grammar.inner) allocates a new tokenizer with
151 // default malloc/free. The C side copies the grammar.
152 let raw = NonNull::new(unsafe {
153 ffi::CTokenizer::create(std::ptr::null(), Into::<AnyGrammar>::into(grammar).inner)
154 })
155 .expect("tokenizer allocation failed");
156
157 TypedTokenizer {
158 inner: Rc::new(RefCell::new(Some(TokenizerInner {
159 raw,
160 source_buf: Vec::new(),
161 }))),
162 _marker: PhantomData,
163 }
164 }
165
166 /// Tokenize source and iterate typed tokens.
167 ///
168 /// The source is copied; the original does not need to outlive the iterator.
169 /// For zero-copy tokenization use [`tokenize_cstr`](Self::tokenize_cstr).
170 ///
171 /// # Examples
172 ///
173 /// ```rust
174 /// use syntaqlite_syntax::TokenType;
175 /// use syntaqlite_syntax::typed::{grammar, TypedTokenizer};
176 ///
177 /// let tokenizer = TypedTokenizer::new(grammar());
178 /// let tokens: Vec<_> = tokenizer.tokenize("SELECT 1").collect();
179 ///
180 /// assert_eq!(tokens[0].token_type(), TokenType::Select);
181 /// assert_eq!(tokens[0].text(), "SELECT");
182 /// ```
183 ///
184 /// # Panics
185 ///
186 /// Panics if another cursor from this tokenizer is still active.
187 /// Drop the previous iterator before starting a new one.
188 pub fn tokenize<'a>(
189 &self,
190 source: &'a str,
191 ) -> impl Iterator<Item = TypedToken<'a, G>> + use<'a, G> {
192 let mut inner = self
193 .inner
194 .borrow_mut()
195 .take()
196 .expect("TypedTokenizer::tokenize called while a cursor is still active");
197
198 inner.source_buf.clear();
199 inner.source_buf.reserve(source.len() + 1);
200 inner.source_buf.extend_from_slice(source.as_bytes());
201 inner.source_buf.push(0);
202
203 // source_buf has at least one byte (the null terminator just pushed).
204 let c_source_ptr =
205 NonNull::new(inner.source_buf.as_mut_ptr()).expect("source_buf is non-empty");
206 // SAFETY: inner.raw is valid; c_source_ptr points to source_buf which
207 // is null-terminated. source_buf lives inside inner which will be owned
208 // by the cursor.
209 unsafe {
210 inner.raw.as_mut().reset(
211 c_source_ptr.as_ptr() as *const _,
212 #[expect(clippy::cast_possible_truncation)]
213 {
214 source.len() as u32
215 },
216 );
217 }
218 TypedTokenCursor {
219 raw: inner.raw,
220 source,
221 c_source_base: c_source_ptr,
222 inner: Some(inner),
223 slot: Rc::clone(&self.inner),
224 _marker: PhantomData,
225 }
226 }
227
228 /// Zero-copy tokenization over a null-terminated source buffer.
229 ///
230 /// No copy is performed. The source must be valid UTF-8 (panics otherwise).
231 ///
232 /// # Examples
233 ///
234 /// ```rust
235 /// use std::ffi::CString;
236 /// use syntaqlite_syntax::TokenType;
237 /// use syntaqlite_syntax::typed::{grammar, TypedTokenizer};
238 ///
239 /// let tokenizer = TypedTokenizer::new(grammar());
240 /// let sql = CString::new("SELECT 1").unwrap();
241 /// let types: Vec<_> = tokenizer.tokenize_cstr(&sql).map(|t| t.token_type()).collect();
242 ///
243 /// assert!(types.contains(&TokenType::Select));
244 /// ```
245 ///
246 /// # Panics
247 ///
248 /// Panics if another cursor from this tokenizer is still active,
249 /// or if `source` is not valid UTF-8.
250 pub fn tokenize_cstr<'a>(
251 &self,
252 source: &'a CStr,
253 ) -> impl Iterator<Item = TypedToken<'a, G>> + use<'a, G> {
254 let mut inner = self
255 .inner
256 .borrow_mut()
257 .take()
258 .expect("TypedTokenizer::tokenize_cstr called while a cursor is still active");
259
260 let bytes = source.to_bytes();
261 let source_str = std::str::from_utf8(bytes).expect("source must be valid UTF-8");
262
263 // SAFETY: inner.raw is valid; source is a CStr (null-terminated, valid for 'a).
264 unsafe {
265 inner.raw.as_mut().reset(
266 source.as_ptr(),
267 #[expect(clippy::cast_possible_truncation)]
268 {
269 bytes.len() as u32
270 },
271 );
272 };
273 TypedTokenCursor {
274 raw: inner.raw,
275 source: source_str,
276 c_source_base: NonNull::new(source.as_ptr() as *mut u8).expect("CStr is non-null"),
277 inner: Some(inner),
278 slot: Rc::clone(&self.inner),
279 _marker: PhantomData,
280 }
281 }
282}
283
/// Token value shared by typed and SQLite-specific tokenizer APIs.
///
/// Provides:
///
/// - Grammar-typed token kind.
/// - Exact source text slice.
#[derive(Debug, Clone, Copy)]
pub struct TypedToken<'a, G: TypedGrammar> {
    // Grammar-specific token kind, converted from the raw C token type.
    token_type: G::Token,
    // Slice of the caller's source covering exactly this token.
    text: &'a str,
}
295
296impl<'a, G: TypedGrammar> TypedToken<'a, G> {
297 /// The grammar-typed token variant.
298 pub fn token_type(&self) -> G::Token {
299 self.token_type
300 }
301
302 /// The source text slice covered by this token.
303 pub fn text(&self) -> &'a str {
304 self.text
305 }
306}
307
/// Tokenizer alias for grammar-independent code that picks grammar at runtime.
///
/// This is a type alias for [`TypedTokenizer<AnyGrammar>`].
pub type AnyTokenizer = TypedTokenizer<AnyGrammar>;

/// Token alias for grammar-independent tokenization pipelines.
///
/// This is a type alias for [`TypedToken`]`<'a, `[`AnyGrammar`]`>`.
pub type AnyToken<'a> = TypedToken<'a, AnyGrammar>;
315
316// ── Crate-internal ───────────────────────────────────────────────────────────
317
/// An iterator over tokens produced by [`TypedTokenizer::tokenize`] or [`TypedTokenizer::tokenize_cstr`].
///
/// Returned by the `tokenize` family of methods on [`TypedTokenizer`] and [`AnyTokenizer`].
/// Implements [`Iterator`]`<Item = `[`TypedToken`]`<'a, G>>`.
struct TypedTokenCursor<'a, G: TypedGrammar> {
    // Raw C tokenizer handle; duplicates the pointer held in `inner` so
    // `next` can use it without unwrapping the Option each call.
    raw: NonNull<ffi::CTokenizer>,
    // UTF-8 view of the tokenized source; emitted token texts borrow from it.
    source: &'a str,
    /// Base pointer of the C source buffer. Used to compute byte offsets back
    /// into the Rust `source` slice.
    c_source_base: NonNull<u8>,
    // Tokenizer state owned for the cursor's lifetime; handed back to `slot`
    // in `Drop` so the tokenizer can be reused.
    inner: Option<TokenizerInner>,
    // The owning tokenizer's shared slot that `inner` is returned to.
    slot: Rc<RefCell<Option<TokenizerInner>>>,
    _marker: PhantomData<G>,
}
332
333impl<G: TypedGrammar> Drop for TypedTokenCursor<'_, G> {
334 fn drop(&mut self) {
335 if let Some(inner) = self.inner.take() {
336 *self.slot.borrow_mut() = Some(inner);
337 }
338 }
339}
340
impl<'a, G: TypedGrammar> Iterator for TypedTokenCursor<'a, G> {
    type Item = TypedToken<'a, G>;

    fn next(&mut self) -> Option<Self::Item> {
        // Loop because tokens whose raw type has no `G::Token` counterpart
        // (`from_token_type` returns None) are skipped, not emitted.
        loop {
            let mut token = ffi::CToken {
                text: std::ptr::null(),
                length: 0,
                type_: 0,
            };
            // SAFETY: self.raw is valid (owned by TokenizerInner in self.inner);
            // &mut token is a valid output parameter.
            let rc = unsafe { self.raw.as_mut().next(&raw mut token) };
            if rc == 0 {
                // 0 from the C side means end of input.
                return None;
            }

            if let Some(token_type) = G::Token::from_token_type(AnyTokenType(token.type_)) {
                // Compute offset into the source string from the C pointer.
                // token.text points into the same buffer `reset` was given, so
                // its distance from c_source_base is a byte offset into
                // `source`; the slice re-borrows from `source` (lifetime 'a),
                // never from the C buffer.
                let offset = token.text as usize - self.c_source_base.as_ptr() as usize;
                let text = &self.source[offset..offset + token.length as usize];
                return Some(TypedToken { token_type, text });
            }
        }
    }
}
367
/// Owned C tokenizer handle plus the reusable source-copy buffer.
pub(crate) struct TokenizerInner {
    // Pointer allocated by `ffi::CTokenizer::create`; freed exactly once in Drop.
    raw: NonNull<ffi::CTokenizer>,
    // Scratch buffer that `tokenize` copies the source into (null-terminated);
    // reused across calls to avoid reallocating.
    source_buf: Vec<u8>,
}
372
impl Drop for TokenizerInner {
    // Frees the underlying C tokenizer; `source_buf` is freed by Vec's own Drop.
    fn drop(&mut self) {
        // SAFETY: self.raw was allocated by syntaqlite_tokenizer_create and has
        // not been freed (Drop runs exactly once).
        unsafe { ffi::CTokenizer::destroy(self.raw.as_ptr()) }
    }
}
380
381// ── ffi ───────────────────────────────────────────────────────────────────────
382
mod ffi {
    //! Thin raw bindings to the C tokenizer. Each wrapper forwards to the
    //! corresponding `syntaqlite_tokenizer_*` symbol and documents the
    //! contract the caller must uphold.

    use std::ffi::c_char;

    /// Opaque C tokenizer type.
    pub(crate) enum CTokenizer {}

    impl CTokenizer {
        /// Allocate a new tokenizer for `grammar`.
        ///
        /// # Safety
        ///
        /// `mem` must be null (default malloc/free) or a valid mem-methods
        /// pointer; `grammar` must be a valid grammar descriptor.
        pub(crate) unsafe fn create(
            mem: *const std::ffi::c_void,
            grammar: crate::grammar::ffi::CGrammar,
        ) -> *mut Self {
            // SAFETY: caller guarantees `mem` is null or a valid mem-methods
            // pointer; `grammar` is a valid grammar descriptor.
            unsafe { syntaqlite_tokenizer_create_with_grammar(mem, grammar) }
        }

        /// Point the tokenizer at a new source buffer.
        ///
        /// # Safety
        ///
        /// `self` must be a live tokenizer and `source` must point to at
        /// least `len` bytes of valid, null-terminated C string data that
        /// outlives subsequent `next` calls.
        pub(crate) unsafe fn reset(&mut self, source: *const c_char, len: u32) {
            // SAFETY: caller guarantees `self` is valid and `source` points to
            // at least `len` bytes of valid, null-terminated C string data.
            unsafe { syntaqlite_tokenizer_reset(self, source, len) }
        }

        /// Fetch the next token into `out`; returns 0 at end of input.
        ///
        /// # Safety
        ///
        /// `self` must be a live tokenizer that has had `reset` called, and
        /// `out` must be a valid, writable `CToken` pointer.
        pub(crate) unsafe fn next(&mut self, out: *mut CToken) -> u32 {
            // SAFETY: caller guarantees `self` is valid after a `reset` call
            // and `out` is a valid writable pointer to a `CToken`.
            unsafe { syntaqlite_tokenizer_next(self, out) }
        }

        /// Free a tokenizer previously returned by [`CTokenizer::create`].
        ///
        /// # Safety
        ///
        /// `this` must have been allocated by `create` and not freed yet;
        /// it must not be used afterwards.
        pub(crate) unsafe fn destroy(this: *mut Self) {
            // SAFETY: caller guarantees `this` was allocated by `create` and
            // has not been freed yet (called exactly once from `Drop`).
            unsafe { syntaqlite_tokenizer_destroy(this) }
        }
    }

    /// A single token produced by the C tokenizer.
    ///
    /// Mirrors C `SyntaqliteToken` from `include/syntaqlite/tokenizer.h`.
    #[repr(C)]
    pub(crate) struct CToken {
        pub(crate) text: *const c_char,
        pub(crate) length: u32,
        pub(crate) type_: u32,
    }

    unsafe extern "C" {
        fn syntaqlite_tokenizer_create_with_grammar(
            mem: *const std::ffi::c_void,
            grammar: crate::grammar::ffi::CGrammar,
        ) -> *mut CTokenizer;
        fn syntaqlite_tokenizer_reset(tok: *mut CTokenizer, source: *const c_char, len: u32);
        fn syntaqlite_tokenizer_next(tok: *mut CTokenizer, out: *mut CToken) -> u32;
        fn syntaqlite_tokenizer_destroy(tok: *mut CTokenizer);
    }
}
438
#[cfg(all(test, feature = "sqlite"))]
mod tests {
    use std::ffi::CString;
    use std::panic::{self, AssertUnwindSafe};

    use super::{TokenType, Tokenizer};

    #[test]
    fn tokenizer_emits_expected_core_tokens() {
        let tokenizer = Tokenizer::new();
        // Drop whitespace/comment tokens so the comparison stays compact.
        let observed: Vec<_> = tokenizer
            .tokenize("SELECT x, 1 FROM t;")
            .filter(|tok| !matches!(tok.token_type(), TokenType::Space | TokenType::Comment))
            .map(|tok| (tok.token_type(), tok.text().to_owned()))
            .collect();

        let expected = vec![
            (TokenType::Select, "SELECT".to_owned()),
            (TokenType::Id, "x".to_owned()),
            (TokenType::Comma, ",".to_owned()),
            (TokenType::Integer, "1".to_owned()),
            (TokenType::From, "FROM".to_owned()),
            (TokenType::Id, "t".to_owned()),
            (TokenType::Semi, ";".to_owned()),
        ];
        assert_eq!(observed, expected);
    }

    #[test]
    fn tokenizer_cstr_matches_str_path() {
        let sql = CString::new("SELECT 1;").expect("source has no interior NUL");
        let tokenizer = Tokenizer::new();

        let via_str: Vec<_> = tokenizer
            .tokenize(sql.to_str().expect("source is UTF-8"))
            .map(|tok| (tok.token_type(), tok.text().to_owned()))
            .collect();

        let via_cstr: Vec<_> = tokenizer
            .tokenize_cstr(sql.as_c_str())
            .map(|tok| (tok.token_type(), tok.text().to_owned()))
            .collect();

        assert_eq!(via_str, via_cstr);
    }

    #[test]
    fn tokenizer_allows_only_one_live_cursor() {
        let tokenizer = Tokenizer::new();
        let mut first = tokenizer.tokenize("SELECT 1;");
        assert!(first.next().is_some());

        // Starting a second cursor while the first is alive must panic.
        let overlap = panic::catch_unwind(AssertUnwindSafe(|| {
            let _second = tokenizer.tokenize("SELECT 2;");
        }));
        assert!(overlap.is_err());

        // After dropping the first cursor, tokenizing works again.
        drop(first);
        assert!(tokenizer.tokenize("SELECT 2;").count() > 0);
    }
}