// syntaqlite_syntax/tokenizer.rs
// Copyright 2025 The syntaqlite Authors. All rights reserved.
// Licensed under the Apache License, Version 2.0.
4use std::cell::RefCell;
5use std::ffi::CStr;
6use std::marker::PhantomData;
7use std::ptr::NonNull;
8use std::rc::Rc;
9
10use crate::any::AnyTokenType;
11use crate::ast::GrammarTokenType;
12use crate::grammar::{AnyGrammar, TypedGrammar};
13
14#[cfg(feature = "sqlite")]
15use crate::sqlite::grammar::Grammar;
16#[cfg(feature = "sqlite")]
17use crate::sqlite::tokens::TokenType;
18
// ── Public API ───────────────────────────────────────────────────────────────
20
/// High-level tokenizer for `SQLite` SQL.
///
/// In most codebases this is the tokenizer you want.
///
/// - Fast lexical analysis without building an AST.
/// - Returns token kind + original source slice.
/// - Reusable across many SQL inputs.
///
/// Advanced generic tokenizer APIs exist in [`crate::typed`] and [`crate::any`].
// NOTE: `#[doc(hidden)]` was removed here — it contradicted the extensive
// public documentation and doctests above, which present this as the primary
// entry point of the crate.
#[cfg(feature = "sqlite")]
pub struct Tokenizer(TypedTokenizer<Grammar>);
33
#[cfg(feature = "sqlite")]
impl Tokenizer {
    /// Create a tokenizer for `SQLite` SQL.
    pub fn new() -> Self {
        Tokenizer(TypedTokenizer::new(crate::sqlite::grammar::grammar()))
    }

    /// Tokenize one SQL source string and iterate `SQLite` tokens.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use syntaqlite_syntax::{Tokenizer, TokenType};
    ///
    /// let tokenizer = Tokenizer::new();
    /// let tokens: Vec<_> = tokenizer
    ///     .tokenize("SELECT x FROM t")
    ///     .map(|tok| (tok.token_type(), tok.text().to_string()))
    ///     .collect();
    ///
    /// assert!(tokens.iter().any(|(ty, _)| *ty == TokenType::Select));
    /// assert!(tokens.iter().any(|(_, text)| text == "x"));
    /// ```
    ///
    /// # Panics
    ///
    /// Panics if another cursor from this tokenizer is still active.
    /// Drop the previous iterator before starting a new one.
    //
    // `+ use<'a>` matches the precise capturing of the inner
    // `TypedTokenizer::tokenize`: the returned iterator owns its state and
    // must not be treated as borrowing `&self` (edition-2024 RPIT would
    // otherwise capture the `&self` lifetime implicitly).
    pub fn tokenize<'a>(
        &self,
        source: &'a str,
    ) -> impl Iterator<Item = Token<'a>> + use<'a> {
        self.0.tokenize(source).map(Token)
    }

    /// Zero-copy tokenization over a null-terminated source buffer.
    ///
    /// Use this when your SQL already lives in a [`CStr`] and you want to
    /// avoid copying.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use std::ffi::CString;
    /// use syntaqlite_syntax::{Tokenizer, TokenType};
    ///
    /// let tokenizer = Tokenizer::new();
    /// let sql = CString::new("SELECT 1").unwrap();
    /// let types: Vec<_> = tokenizer.tokenize_cstr(&sql).map(|t| t.token_type()).collect();
    ///
    /// assert!(types.contains(&TokenType::Select));
    /// ```
    ///
    /// # Panics
    ///
    /// Panics if another cursor from this tokenizer is still active,
    /// or if `source` is not valid UTF-8.
    pub fn tokenize_cstr<'a>(
        &self,
        source: &'a CStr,
    ) -> impl Iterator<Item = Token<'a>> + use<'a> {
        self.0.tokenize_cstr(source).map(Token)
    }
}
92
#[cfg(feature = "sqlite")]
impl Default for Tokenizer {
    /// Equivalent to [`Tokenizer::new`].
    fn default() -> Self {
        Tokenizer::new()
    }
}
99
/// Token emitted by [`Tokenizer`](crate::Tokenizer), including kind and source slice.
///
/// Typical usage:
///
/// - Inspect token kind via [`token_type`](Self::token_type).
/// - Read exact source text via [`text`](Self::text).
// NOTE: `#[doc(hidden)]` was removed here — the type is part of the primary
// documented API (returned by `Tokenizer::tokenize`) and hiding it
// contradicted its own rustdoc.
#[cfg(feature = "sqlite")]
pub struct Token<'a>(TypedToken<'a, Grammar>);
109
#[cfg(feature = "sqlite")]
impl<'a> Token<'a> {
    /// The `SQLite` token type.
    pub fn token_type(&self) -> TokenType {
        let Token(typed) = self;
        typed.token_type()
    }

    /// The source text slice covered by this token.
    pub fn text(&self) -> &'a str {
        let Token(typed) = self;
        typed.text()
    }
}
122
/// Tokenizer parameterized by grammar type `G`.
///
/// Useful for reusable tooling built against generated grammars.
///
/// - Use this when grammar type is known at compile time.
/// - Use [`Tokenizer`] for typical `SQLite` SQL app code.
pub struct TypedTokenizer<G: TypedGrammar> {
    /// Shared slot holding the C tokenizer state. It is `None` while a
    /// cursor borrowed from this tokenizer is alive; the cursor puts the
    /// state back into the slot when it is dropped.
    inner: Rc<RefCell<Option<TokenizerInner>>>,
    /// Ties the grammar type parameter to this tokenizer without storing it.
    _marker: PhantomData<G>,
}
134
135impl<G: TypedGrammar> TypedTokenizer<G> {
136    /// Create a tokenizer for grammar `G`.
137    ///
138    /// # Examples
139    ///
140    /// ```rust
141    /// use syntaqlite_syntax::typed::{grammar, TypedTokenizer};
142    ///
143    /// let _tokenizer = TypedTokenizer::new(grammar());
144    /// ```
145    ///
146    /// # Panics
147    ///
148    /// Panics if tokenizer allocation fails (out of memory).
149    pub fn new(grammar: G) -> Self {
150        // SAFETY: create(NULL, grammar.inner) allocates a new tokenizer with
151        // default malloc/free. The C side copies the grammar.
152        let raw = NonNull::new(unsafe {
153            ffi::CTokenizer::create(std::ptr::null(), Into::<AnyGrammar>::into(grammar).inner)
154        })
155        .expect("tokenizer allocation failed");
156
157        TypedTokenizer {
158            inner: Rc::new(RefCell::new(Some(TokenizerInner {
159                raw,
160                source_buf: Vec::new(),
161            }))),
162            _marker: PhantomData,
163        }
164    }
165
166    /// Tokenize source and iterate typed tokens.
167    ///
168    /// The source is copied; the original does not need to outlive the iterator.
169    /// For zero-copy tokenization use [`tokenize_cstr`](Self::tokenize_cstr).
170    ///
171    /// # Examples
172    ///
173    /// ```rust
174    /// use syntaqlite_syntax::TokenType;
175    /// use syntaqlite_syntax::typed::{grammar, TypedTokenizer};
176    ///
177    /// let tokenizer = TypedTokenizer::new(grammar());
178    /// let tokens: Vec<_> = tokenizer.tokenize("SELECT 1").collect();
179    ///
180    /// assert_eq!(tokens[0].token_type(), TokenType::Select);
181    /// assert_eq!(tokens[0].text(), "SELECT");
182    /// ```
183    ///
184    /// # Panics
185    ///
186    /// Panics if another cursor from this tokenizer is still active.
187    /// Drop the previous iterator before starting a new one.
188    pub fn tokenize<'a>(
189        &self,
190        source: &'a str,
191    ) -> impl Iterator<Item = TypedToken<'a, G>> + use<'a, G> {
192        let mut inner = self
193            .inner
194            .borrow_mut()
195            .take()
196            .expect("TypedTokenizer::tokenize called while a cursor is still active");
197
198        inner.source_buf.clear();
199        inner.source_buf.reserve(source.len() + 1);
200        inner.source_buf.extend_from_slice(source.as_bytes());
201        inner.source_buf.push(0);
202
203        // source_buf has at least one byte (the null terminator just pushed).
204        let c_source_ptr =
205            NonNull::new(inner.source_buf.as_mut_ptr()).expect("source_buf is non-empty");
206        // SAFETY: inner.raw is valid; c_source_ptr points to source_buf which
207        // is null-terminated. source_buf lives inside inner which will be owned
208        // by the cursor.
209        unsafe {
210            inner.raw.as_mut().reset(
211                c_source_ptr.as_ptr() as *const _,
212                #[expect(clippy::cast_possible_truncation)]
213                {
214                    source.len() as u32
215                },
216            );
217        }
218        TypedTokenCursor {
219            raw: inner.raw,
220            source,
221            c_source_base: c_source_ptr,
222            inner: Some(inner),
223            slot: Rc::clone(&self.inner),
224            _marker: PhantomData,
225        }
226    }
227
228    /// Zero-copy tokenization over a null-terminated source buffer.
229    ///
230    /// No copy is performed. The source must be valid UTF-8 (panics otherwise).
231    ///
232    /// # Examples
233    ///
234    /// ```rust
235    /// use std::ffi::CString;
236    /// use syntaqlite_syntax::TokenType;
237    /// use syntaqlite_syntax::typed::{grammar, TypedTokenizer};
238    ///
239    /// let tokenizer = TypedTokenizer::new(grammar());
240    /// let sql = CString::new("SELECT 1").unwrap();
241    /// let types: Vec<_> = tokenizer.tokenize_cstr(&sql).map(|t| t.token_type()).collect();
242    ///
243    /// assert!(types.contains(&TokenType::Select));
244    /// ```
245    ///
246    /// # Panics
247    ///
248    /// Panics if another cursor from this tokenizer is still active,
249    /// or if `source` is not valid UTF-8.
250    pub fn tokenize_cstr<'a>(
251        &self,
252        source: &'a CStr,
253    ) -> impl Iterator<Item = TypedToken<'a, G>> + use<'a, G> {
254        let mut inner = self
255            .inner
256            .borrow_mut()
257            .take()
258            .expect("TypedTokenizer::tokenize_cstr called while a cursor is still active");
259
260        let bytes = source.to_bytes();
261        let source_str = std::str::from_utf8(bytes).expect("source must be valid UTF-8");
262
263        // SAFETY: inner.raw is valid; source is a CStr (null-terminated, valid for 'a).
264        unsafe {
265            inner.raw.as_mut().reset(
266                source.as_ptr(),
267                #[expect(clippy::cast_possible_truncation)]
268                {
269                    bytes.len() as u32
270                },
271            );
272        };
273        TypedTokenCursor {
274            raw: inner.raw,
275            source: source_str,
276            c_source_base: NonNull::new(source.as_ptr() as *mut u8).expect("CStr is non-null"),
277            inner: Some(inner),
278            slot: Rc::clone(&self.inner),
279            _marker: PhantomData,
280        }
281    }
282}
283
/// Token value shared by typed and SQLite-specific tokenizer APIs.
///
/// Provides:
///
/// - Grammar-typed token kind.
/// - Exact source text slice.
#[derive(Debug, Clone, Copy)]
pub struct TypedToken<'a, G: TypedGrammar> {
    /// The grammar's token variant for this lexeme.
    token_type: G::Token,
    /// Slice of the original source covered by this token.
    text: &'a str,
}
295
296impl<'a, G: TypedGrammar> TypedToken<'a, G> {
297    /// The grammar-typed token variant.
298    pub fn token_type(&self) -> G::Token {
299        self.token_type
300    }
301
302    /// The source text slice covered by this token.
303    pub fn text(&self) -> &'a str {
304        self.text
305    }
306}
307
/// Tokenizer alias for grammar-independent code that picks grammar at runtime.
///
/// This is a type alias for [`TypedTokenizer<AnyGrammar>`].
pub type AnyTokenizer = TypedTokenizer<AnyGrammar>;

/// Token alias for grammar-independent tokenization pipelines.
///
/// Alias for [`TypedToken`] instantiated with [`AnyGrammar`].
pub type AnyToken<'a> = TypedToken<'a, AnyGrammar>;
315
// ── Crate-internal ───────────────────────────────────────────────────────────
317
/// An iterator over tokens produced by [`TypedTokenizer::tokenize`] or [`TypedTokenizer::tokenize_cstr`].
///
/// Returned by the `tokenize` family of methods on [`TypedTokenizer`] and [`AnyTokenizer`].
/// Implements [`Iterator`]`<Item = `[`TypedToken`]`<'a, G>>`.
struct TypedTokenCursor<'a, G: TypedGrammar> {
    /// Raw C tokenizer. Duplicates the pointer held by `inner` so `next`
    /// does not have to re-borrow through the `Option` on every call.
    raw: NonNull<ffi::CTokenizer>,
    /// The source being tokenized, used to produce `&'a str` token text.
    source: &'a str,
    /// Base pointer of the C source buffer. Used to compute byte offsets back
    /// into the Rust `source` slice.
    c_source_base: NonNull<u8>,
    /// Tokenizer state held for the cursor's lifetime; `Some` until `Drop`
    /// hands it back to `slot`.
    inner: Option<TokenizerInner>,
    /// The owning tokenizer's shared slot, refilled in `Drop` so the
    /// tokenizer can start a new cursor.
    slot: Rc<RefCell<Option<TokenizerInner>>>,
    /// Ties the grammar type parameter to this cursor without storing it.
    _marker: PhantomData<G>,
}
332
333impl<G: TypedGrammar> Drop for TypedTokenCursor<'_, G> {
334    fn drop(&mut self) {
335        if let Some(inner) = self.inner.take() {
336            *self.slot.borrow_mut() = Some(inner);
337        }
338    }
339}
340
impl<'a, G: TypedGrammar> Iterator for TypedTokenCursor<'a, G> {
    type Item = TypedToken<'a, G>;

    fn next(&mut self) -> Option<Self::Item> {
        // Loop until the C tokenizer yields a token the typed grammar
        // recognizes, or reports end of input.
        loop {
            let mut token = ffi::CToken {
                text: std::ptr::null(),
                length: 0,
                type_: 0,
            };
            // SAFETY: self.raw is valid (owned by TokenizerInner in self.inner);
            // &mut token is a valid output parameter.
            let rc = unsafe { self.raw.as_mut().next(&raw mut token) };
            if rc == 0 {
                // The C API returns 0 when the source is exhausted.
                return None;
            }

            // Token ids the grammar does not map are skipped: the `if let`
            // falls through and the loop requests the next token.
            if let Some(token_type) = G::Token::from_token_type(AnyTokenType(token.type_)) {
                // Compute offset into the source string from the C pointer.
                // `token.text` points into the buffer handed to `reset`, whose
                // base is `c_source_base`, so the difference is a byte offset
                // into `self.source`.
                let offset = token.text as usize - self.c_source_base.as_ptr() as usize;
                let text = &self.source[offset..offset + token.length as usize];
                return Some(TypedToken { token_type, text });
            }
        }
    }
}
367
/// Owns the raw C tokenizer plus the scratch buffer used by the copying path.
pub(crate) struct TokenizerInner {
    /// Pointer allocated by `ffi::CTokenizer::create`; freed in `Drop`.
    raw: NonNull<ffi::CTokenizer>,
    /// Null-terminated copy of the source for `tokenize(&str)`; unused by
    /// the zero-copy `tokenize_cstr` path. Reused across calls.
    source_buf: Vec<u8>,
}
372
impl Drop for TokenizerInner {
    fn drop(&mut self) {
        // SAFETY: self.raw was allocated by `ffi::CTokenizer::create`
        // (i.e. syntaqlite_tokenizer_create_with_grammar) and has not been
        // freed (Drop runs exactly once).
        unsafe { ffi::CTokenizer::destroy(self.raw.as_ptr()) }
    }
}
380
// ── ffi ───────────────────────────────────────────────────────────────────────
382
383mod ffi {
384    use std::ffi::c_char;
385
386    /// Opaque C tokenizer type.
387    pub(crate) enum CTokenizer {}
388
389    impl CTokenizer {
390        pub(crate) unsafe fn create(
391            mem: *const std::ffi::c_void,
392            grammar: crate::grammar::ffi::CGrammar,
393        ) -> *mut Self {
394            // SAFETY: caller guarantees `mem` is null or a valid mem-methods
395            // pointer; `grammar` is a valid grammar descriptor.
396            unsafe { syntaqlite_tokenizer_create_with_grammar(mem, grammar) }
397        }
398
399        pub(crate) unsafe fn reset(&mut self, source: *const c_char, len: u32) {
400            // SAFETY: caller guarantees `self` is valid and `source` points to
401            // at least `len` bytes of valid, null-terminated C string data.
402            unsafe { syntaqlite_tokenizer_reset(self, source, len) }
403        }
404
405        pub(crate) unsafe fn next(&mut self, out: *mut CToken) -> u32 {
406            // SAFETY: caller guarantees `self` is valid after a `reset` call
407            // and `out` is a valid writable pointer to a `CToken`.
408            unsafe { syntaqlite_tokenizer_next(self, out) }
409        }
410
411        pub(crate) unsafe fn destroy(this: *mut Self) {
412            // SAFETY: caller guarantees `this` was allocated by `create` and
413            // has not been freed yet (called exactly once from `Drop`).
414            unsafe { syntaqlite_tokenizer_destroy(this) }
415        }
416    }
417
418    /// A single token produced by the C tokenizer.
419    ///
420    /// Mirrors C `SyntaqliteToken` from `include/syntaqlite/tokenizer.h`.
421    #[repr(C)]
422    pub(crate) struct CToken {
423        pub(crate) text: *const c_char,
424        pub(crate) length: u32,
425        pub(crate) type_: u32,
426    }
427
428    unsafe extern "C" {
429        fn syntaqlite_tokenizer_create_with_grammar(
430            mem: *const std::ffi::c_void,
431            grammar: crate::grammar::ffi::CGrammar,
432        ) -> *mut CTokenizer;
433        fn syntaqlite_tokenizer_reset(tok: *mut CTokenizer, source: *const c_char, len: u32);
434        fn syntaqlite_tokenizer_next(tok: *mut CTokenizer, out: *mut CToken) -> u32;
435        fn syntaqlite_tokenizer_destroy(tok: *mut CTokenizer);
436    }
437}
438
#[cfg(all(test, feature = "sqlite"))]
mod tests {
    use std::ffi::CString;
    use std::panic::{self, AssertUnwindSafe};

    use super::{TokenType, Tokenizer};

    #[test]
    fn tokenizer_emits_expected_core_tokens() {
        let lexer = Tokenizer::new();
        // Drop whitespace/comment trivia and keep (kind, text) pairs.
        let observed: Vec<(TokenType, String)> = lexer
            .tokenize("SELECT x, 1 FROM t;")
            .filter(|tok| !matches!(tok.token_type(), TokenType::Space | TokenType::Comment))
            .map(|tok| (tok.token_type(), tok.text().to_owned()))
            .collect();

        let expected: Vec<(TokenType, String)> = [
            (TokenType::Select, "SELECT"),
            (TokenType::Id, "x"),
            (TokenType::Comma, ","),
            (TokenType::Integer, "1"),
            (TokenType::From, "FROM"),
            (TokenType::Id, "t"),
            (TokenType::Semi, ";"),
        ]
        .into_iter()
        .map(|(kind, text)| (kind, text.to_owned()))
        .collect();

        assert_eq!(observed, expected);
    }

    #[test]
    fn tokenizer_cstr_matches_str_path() {
        let sql = CString::new("SELECT 1;").expect("source has no interior NUL");
        let lexer = Tokenizer::new();

        let via_str: Vec<_> = lexer
            .tokenize(sql.to_str().expect("source is UTF-8"))
            .map(|tok| (tok.token_type(), tok.text().to_owned()))
            .collect();

        let via_cstr: Vec<_> = lexer
            .tokenize_cstr(sql.as_c_str())
            .map(|tok| (tok.token_type(), tok.text().to_owned()))
            .collect();

        // Both entry points must produce identical token streams.
        assert_eq!(via_str, via_cstr);
    }

    #[test]
    fn tokenizer_allows_only_one_live_cursor() {
        let lexer = Tokenizer::new();
        let mut first = lexer.tokenize("SELECT 1;");
        assert!(first.next().is_some());

        // Starting a second cursor while the first is alive must panic.
        let attempt = panic::catch_unwind(AssertUnwindSafe(|| {
            let _cursor = lexer.tokenize("SELECT 2;");
        }));
        assert!(attempt.is_err());

        // Once the first cursor is dropped, tokenizing works again.
        drop(first);
        assert!(lexer.tokenize("SELECT 2;").count() > 0);
    }
}