Skip to main content

syntaqlite_syntax/
grammar.rs

1// Copyright 2025 The syntaqlite Authors. All rights reserved.
2// Licensed under the Apache License, Version 2.0.
3
4// ── Public API ───────────────────────────────────────────────────────────────
5
6use crate::any::{AnyNodeTag, AnyTokenType};
7use crate::util::{SqliteSyntaxFlags, SqliteVersion};
8
/// Shape of a single field value as seen by runtime AST reflection.
///
/// Grammar-agnostic tooling uses this to interpret node fields when no
/// generated Rust types are available.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FieldKind {
    /// Identifier of a child node.
    NodeId = 0,
    /// Source span (byte offset + length).
    Span = 1,
    /// Boolean flag.
    Bool = 2,
    /// Compact bitfield of flags.
    Flags = 3,
    /// Discriminant of an enum variant.
    Enum = 4,
}

impl FieldKind {
    /// Decode a raw kind byte.
    ///
    /// Any byte that does not match a variant's discriminant decodes as
    /// [`FieldKind::NodeId`].
    fn from_u8(v: u8) -> Self {
        // Position `n` holds the variant whose discriminant is `n`.
        const BY_DISCRIMINANT: [FieldKind; 5] = [
            FieldKind::NodeId,
            FieldKind::Span,
            FieldKind::Bool,
            FieldKind::Flags,
            FieldKind::Enum,
        ];
        BY_DISCRIMINANT
            .get(usize::from(v))
            .copied()
            .unwrap_or(FieldKind::NodeId)
    }
}
39
/// High-level semantic class of a token.
///
/// Commonly used for syntax highlighting, token styling, and lightweight
/// heuristics before full semantic analysis.
///
/// Values are produced in two ways in this file: converted from the
/// grammar's per-token-type category table, or chosen from parser flags when
/// a token occurrence was used as a function name, type name, or identifier.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenCategory {
    /// SQL keyword (SELECT, FROM, WHERE, …)
    Keyword,
    /// Bind parameter or session variable (`:name`, `@var`, `?`).
    ///
    /// This is what the C-side `Variable` category converts to.
    Parameter,
    /// String literal or blob literal
    String,
    /// Numeric literal
    Number,
    /// Operator or comparison symbol (`+`, `=`, `||`, …)
    Operator,
    /// Comment (`-- …` or `/* … */`)
    Comment,
    /// Punctuation (`,`, `(`, `)`, `;`, …)
    Punctuation,
    /// Quoted or unquoted identifier
    Identifier,
    /// Built-in or user-defined function name
    Function,
    /// Type name (in CAST, column definitions, …)
    Type,
    /// Anything that doesn't fall into the above categories
    Other,
}
69
70impl From<ffi::CTokenCategory> for TokenCategory {
71    fn from(c: ffi::CTokenCategory) -> Self {
72        match c {
73            ffi::CTokenCategory::Keyword => Self::Keyword,
74            ffi::CTokenCategory::Identifier => Self::Identifier,
75            ffi::CTokenCategory::String => Self::String,
76            ffi::CTokenCategory::Number => Self::Number,
77            ffi::CTokenCategory::Operator => Self::Operator,
78            ffi::CTokenCategory::Punctuation => Self::Punctuation,
79            ffi::CTokenCategory::Comment => Self::Comment,
80            ffi::CTokenCategory::Variable => Self::Parameter,
81            ffi::CTokenCategory::Function => Self::Function,
82            ffi::CTokenCategory::Type => Self::Type,
83            ffi::CTokenCategory::Other => Self::Other,
84        }
85    }
86}
87
88/// Metadata for one AST field of one node type.
89///
90/// Use this to build generic inspectors, serializers, or debug UIs that can
91/// walk arbitrary grammars.
92pub struct FieldMeta<'a>(pub(crate) &'a ffi::CFieldMeta);
93
94impl FieldMeta<'_> {
95    /// Byte offset of this field within its parent AST node struct.
96    pub fn offset(&self) -> u16 {
97        self.0.offset
98    }
99
100    /// Semantic kind of this field.
101    pub fn kind(&self) -> FieldKind {
102        FieldKind::from_u8(self.0.kind)
103    }
104
105    /// The field name as a `&str`.
106    ///
107    /// # Panics
108    /// Panics if the grammar table contains invalid UTF-8 in the field name
109    /// (which would indicate a codegen bug).
110    pub fn name(&self) -> &'static str {
111        // SAFETY: `FieldMeta` is only constructed from static grammar tables
112        // where `name` is always a valid, NUL-terminated UTF-8 C string.
113        unsafe {
114            let cstr = std::ffi::CStr::from_ptr(self.0.name);
115            cstr.to_str().expect("invalid UTF-8 in field name")
116        }
117    }
118
119    /// The `idx`-th display name for enum variants, if present.
120    ///
121    /// # Panics
122    /// Panics if the grammar table contains invalid UTF-8 in a display name
123    /// (which would indicate a codegen bug).
124    pub fn display_name(&self, idx: usize) -> Option<&'static str> {
125        if self.0.display.is_null() || idx >= self.0.display_count as usize {
126            return None;
127        }
128        // SAFETY: `FieldMeta` is only constructed from static grammar tables;
129        // `display` and its entries are valid static C strings.
130        unsafe {
131            let ptr = *self.0.display.add(idx);
132            if ptr.is_null() {
133                return None;
134            }
135            let cstr = std::ffi::CStr::from_ptr(ptr);
136            Some(cstr.to_str().expect("invalid UTF-8 in display name"))
137        }
138    }
139
140    /// Number of display names for this field.
141    pub fn display_count(&self) -> usize {
142        self.0.display_count as usize
143    }
144}
145
/// How the parser ended up using one particular token occurrence.
///
/// This complements the lexical token kind and helps disambiguate tokens such
/// as keyword text that was actually used as an identifier.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ParserTokenFlags(u8);

impl ParserTokenFlags {
    // Flag masks — mirror C SYNQ_TOKEN_FLAG_* in syntaqlite/parser.h.
    const AS_ID: u8 = 1 << 0;
    const AS_FUNCTION: u8 = 1 << 1;
    const AS_TYPE: u8 = 1 << 2;

    /// Construct from a raw C flag bitfield (`SyntaqliteParserTokenFlags = uint32_t`).
    ///
    /// # Panics
    /// Panics if `v` does not fit in a `u8`.
    pub(crate) fn from_raw(v: u32) -> Self {
        Self(u8::try_from(v).expect("parser token flags out of range for u8"))
    }

    /// True when any bit of `mask` is set.
    fn has(self, mask: u8) -> bool {
        (self.0 & mask) != 0
    }

    /// Returns the underlying flag bits.
    pub fn bits(self) -> u8 {
        let ParserTokenFlags(raw) = self;
        raw
    }

    /// True if the token was used as an identifier (`SYNQ_TOKEN_FLAG_AS_ID`).
    pub fn used_as_identifier(self) -> bool {
        self.has(Self::AS_ID)
    }

    /// True if the token was used as a function name (`SYNQ_TOKEN_FLAG_AS_FUNCTION`).
    pub fn used_as_function(self) -> bool {
        self.has(Self::AS_FUNCTION)
    }

    /// True if the token was used as a type name (`SYNQ_TOKEN_FLAG_AS_TYPE`).
    pub fn used_as_type(self) -> bool {
        self.has(Self::AS_TYPE)
    }
}
185
/// Trait implemented by generated grammar handles.
///
/// End users typically consume implementations rather than writing them.
/// The trait links a grammar to its typed node and token enums.
///
/// Implementors must be `Clone` and convertible into [`AnyGrammar`], so a
/// typed handle can always be turned into the grammar-agnostic one.
pub trait TypedGrammar: Clone + Into<AnyGrammar> {
    /// The top-level typed AST node enum for this grammar.
    ///
    /// The `'a` parameter is passed through to
    /// [`GrammarNodeType`](crate::ast::GrammarNodeType).
    type Node<'a>: crate::ast::GrammarNodeType<'a>;
    /// The grammar's typed node ID, wrapping an [`crate::ast::AnyNodeId`].
    ///
    /// Used as the return type of [`TypedNodeList::node_id`](crate::ast::TypedNodeList::node_id)
    /// so callers get a grammar-typed handle rather than a raw [`crate::ast::AnyNodeId`].
    /// It must convert losslessly in both directions (`From` and `Into`).
    type NodeId: Copy + From<crate::ast::AnyNodeId> + Into<crate::ast::AnyNodeId>;
    /// The typed token enum for this grammar.
    type Token: crate::ast::GrammarTokenType;
}
201
/// Grammar handle for runtime-configurable, grammar-agnostic workflows.
///
/// Use `AnyGrammar` when grammar selection/configuration is dynamic (plugins,
/// LSP hosts, multi-grammar test harnesses). It carries version/cflag knobs
/// and introspection metadata, while remaining cheap to clone.
///
/// Built-in grammars hold `&'static` C data directly. Dynamically loaded
/// grammars transmute library-memory pointers to `&'static` and keep the
/// library alive via an [`Arc`](std::sync::Arc). Use `AnyGrammar::load` to create one.
#[derive(Clone)]
pub struct AnyGrammar {
    /// The C grammar value: a raw pointer to the grammar tables plus the
    /// per-handle version and cflag settings.
    pub(crate) inner: ffi::CGrammar,
    /// Keeps the shared library alive for dynamically-loaded grammars.
    /// `None` for built-in (static) grammars.
    _keep_alive: Option<std::sync::Arc<dyn Send + Sync>>,
}

// `Send`/`Sync` are not derived automatically because `inner.template` is a
// raw pointer; the keep-alive handle is already `Send + Sync` by its type.
//
// SAFETY: `inner.template` points to grammar tables that this type only reads
// (see `template()`); the remaining fields of `inner` are plain values owned
// by each handle, so moving a handle to another thread is sound.
// NOTE(review): this assumes the C side never mutates the tables — confirm.
unsafe impl Send for AnyGrammar {}
// SAFETY: AnyGrammar holds a `CGrammar` by value whose `template` field is a
// `*const CGrammarTemplate` to read-only grammar tables; shared references
// only perform reads, so it is safe to share across threads.
unsafe impl Sync for AnyGrammar {}
223
224impl AnyGrammar {
225    /// Construct a `AnyGrammar` from a raw C grammar value.\
226    ///
227    /// This unsafe method exists only for use by grammar implementations which are code generated.
228    /// End users should never need to call this directly.
229    ///
230    /// # Safety
231    /// The `template` pointer inside `inner` must point to valid, `'static`
232    /// C grammar tables (e.g. returned by a grammar's `extern "C"` grammar
233    /// accessor such as `syntaqlite_sqlite_grammar()`).
234    pub unsafe fn new(inner: ffi::CGrammar) -> Self {
235        AnyGrammar {
236            inner,
237            _keep_alive: None,
238        }
239    }
240
241    /// Load a grammar from a shared library (`.so` / `.dylib` / `.dll`).
242    ///
243    /// Resolves `syntaqlite_<name>_grammar` (or `syntaqlite_grammar` when `name`
244    /// is `None`) and calls it to obtain the grammar handle.
245    ///
246    /// # Errors
247    /// Returns `Err` if the library cannot be opened or the grammar symbol is absent.
248    ///
249    /// # Library lifetime
250    /// The loaded library is kept alive via an [`Arc`] stored inside the
251    /// returned `AnyGrammar`. Dropping the last clone of the grammar unloads
252    /// the library. Use [`syntaqlite::Dialect::load`] for dialect-level loading.
253    #[cfg(feature = "dynload")]
254    pub fn load(path: &str, name: Option<&str>) -> Result<Self, String> {
255        // SAFETY: We keep `lib` alive in an `Arc` below so the grammar pointer
256        // lives as long as any clone of the returned AnyGrammar.
257        let lib = unsafe {
258            libloading::Library::new(path).map_err(|e| format!("failed to load {path:?}: {e}"))?
259        };
260
261        let symbol = match name {
262            Some(n) => format!("syntaqlite_{n}_grammar"),
263            None => "syntaqlite_grammar".to_string(),
264        };
265        // SAFETY: We call the function immediately and drop `func` before `lib`
266        // is moved into the Arc, so there is no lifetime overlap issue.
267        let raw: ffi::CGrammar = unsafe {
268            let func: libloading::Symbol<'_, unsafe extern "C" fn() -> ffi::CGrammar> = lib
269                .get(symbol.as_bytes())
270                .map_err(|e| format!("symbol {symbol:?} not found in {path:?}: {e}"))?;
271            func()
272        };
273
274        let keep_alive: std::sync::Arc<dyn Send + Sync> = std::sync::Arc::new(lib);
275
276        // SAFETY: `raw.template` points into the shared library kept alive by
277        // `keep_alive`. Dropping the last AnyGrammar clone unloads the library.
278        Ok(AnyGrammar {
279            inner: raw,
280            _keep_alive: Some(keep_alive),
281        })
282    }
283
284    /// Pin this grammar handle to a target `SQLite` version.
285    ///
286    /// Useful when your product must emulate a specific engine release.
287    #[must_use]
288    pub fn with_version(mut self, version: SqliteVersion) -> Self {
289        self.inner.sqlite_version = version.as_int();
290        self
291    }
292
293    /// Replace compile-time compatibility flags on this handle.
294    #[must_use]
295    pub fn with_cflags(mut self, flags: SqliteSyntaxFlags) -> Self {
296        self.inner.cflags = flags.0;
297        self
298    }
299
300    /// Target `SQLite` version currently configured on this handle.
301    pub fn version(&self) -> SqliteVersion {
302        SqliteVersion::from_int(self.inner.sqlite_version)
303    }
304
305    /// Active C-parser compile-time compatibility flags.
306    pub fn cflags(&self) -> SqliteSyntaxFlags {
307        SqliteSyntaxFlags(self.inner.cflags)
308    }
309
310    /// Return a reference to the abstract grammar template.
311    #[inline]
312    fn template(&self) -> &'static ffi::CGrammarTemplate {
313        // SAFETY: `inner.template` points to static C data (generated grammar tables).
314        unsafe { &*self.inner.template }
315    }
316
317    /// Return the human-readable node name for `tag`.
318    ///
319    /// # Panics
320    /// Panics if `tag` is out of bounds for this grammar.
321    pub fn node_name(&self, tag: AnyNodeTag) -> &'static str {
322        let raw = self.template();
323        let idx = tag.0 as usize;
324        assert!(
325            idx < raw.node_count as usize,
326            "node tag {} out of bounds (count={})",
327            idx,
328            raw.node_count,
329        );
330        // SAFETY: idx is bounds-checked above; node_names is a static array of
331        // length node_count populated by codegen, with valid NUL-terminated strings.
332        unsafe {
333            let cstr = std::ffi::CStr::from_ptr(*raw.node_names.add(idx));
334            cstr.to_str().expect("invalid UTF-8 in node name")
335        }
336    }
337
338    /// Whether `tag` identifies a list node shape.
339    pub fn is_list(&self, tag: AnyNodeTag) -> bool {
340        let raw = self.template();
341        let idx = tag.0 as usize;
342        if idx >= raw.node_count as usize {
343            return false;
344        }
345        // SAFETY: idx is bounds-checked above; list_tags is a static array of
346        // length node_count populated by codegen.
347        unsafe { *raw.list_tags.add(idx) != 0 }
348    }
349
350    /// Return field metadata for nodes with tag `tag`.
351    pub fn field_meta(&self, tag: AnyNodeTag) -> impl ExactSizeIterator<Item = FieldMeta<'static>> {
352        let raw = self.template();
353        let idx = tag.0 as usize;
354        // SAFETY: idx is bounds-checked; field_meta_counts and field_meta are
355        // parallel static arrays of length node_count populated by codegen.
356        let slice: &'static [ffi::CFieldMeta] = unsafe {
357            if idx >= raw.node_count as usize {
358                &[]
359            } else {
360                let count = *raw.field_meta_counts.add(idx) as usize;
361                let ptr = *raw.field_meta.add(idx);
362                if count == 0 || ptr.is_null() {
363                    &[]
364                } else {
365                    std::slice::from_raw_parts(ptr, count)
366                }
367            }
368        };
369        slice.iter().map(FieldMeta)
370    }
371
372    /// Classify a token for presentation/analysis using parser context when available.
373    pub fn classify_token(
374        &self,
375        token_type: AnyTokenType,
376        flags: ParserTokenFlags,
377    ) -> TokenCategory {
378        if flags.used_as_function() {
379            TokenCategory::Function
380        } else if flags.used_as_type() {
381            TokenCategory::Type
382        } else if flags.used_as_identifier() {
383            TokenCategory::Identifier
384        } else {
385            self.token_category(token_type)
386        }
387    }
388
389    /// Return the default semantic category for a token type ordinal.
390    pub fn token_category(&self, token_type: AnyTokenType) -> TokenCategory {
391        let raw = self.template();
392        let idx = token_type.0 as usize;
393        if raw.token_categories.is_null() || idx >= raw.token_type_count as usize {
394            return TokenCategory::Other;
395        }
396        // SAFETY: token_categories is null-checked; it is a static array of
397        // length token_type_count populated by codegen.
398        let byte = unsafe { *raw.token_categories.add(idx) };
399        TokenCategory::from(ffi::CTokenCategory::from_u8(byte))
400    }
401
402    /// Iterate all keywords known to this grammar.
403    ///
404    /// Yields a [`KeywordEntry`] for each keyword, containing the token type
405    /// ordinal and the keyword lexeme (e.g. `SELECT`, `WHERE`).
406    ///
407    /// The iterator implements [`ExactSizeIterator`], so `.len()` gives the
408    /// total keyword count without consuming the iterator.
409    pub fn keywords(&self) -> impl ExactSizeIterator<Item = KeywordEntry> + '_ {
410        let raw = self.template();
411        let count = if raw.keyword_text.is_null()
412            || raw.keyword_offsets.is_null()
413            || raw.keyword_lens.is_null()
414            || raw.keyword_codes.is_null()
415            || raw.keyword_count.is_null()
416        {
417            0
418        } else {
419            // SAFETY: keyword_count is null-checked above; points to a static u32.
420            unsafe { *raw.keyword_count as usize }
421        };
422        KeywordIter {
423            grammar: self,
424            idx: 0,
425            count,
426        }
427    }
428}
429
/// `AnyGrammar` is itself a grammar handle: its node, node-id, and token
/// types are the grammar-agnostic `Any*` types.
impl TypedGrammar for AnyGrammar {
    type Node<'a> = crate::ast::AnyNode<'a>;
    type NodeId = crate::ast::AnyNodeId;
    type Token = AnyTokenType;
}
435
436/// One grammar keyword entry.
437///
438/// Yielded by [`AnyGrammar::keywords`] for completions, lexers, and tooling.
439#[derive(Debug, Clone, Copy, PartialEq, Eq)]
440pub struct KeywordEntry {
441    /// The token type for this keyword.
442    token_type: AnyTokenType,
443    /// The keyword lexeme (e.g. `"SELECT"`, `"WHERE"`).
444    keyword: &'static str,
445}
446
447impl KeywordEntry {
448    /// The token type for this keyword.
449    pub fn token_type(&self) -> AnyTokenType {
450        self.token_type
451    }
452    /// The keyword lexeme (e.g. `"SELECT"`, `"WHERE"`).
453    pub fn keyword(&self) -> &'static str {
454        self.keyword
455    }
456}
457
458struct KeywordIter<'a> {
459    grammar: &'a AnyGrammar,
460    idx: usize,
461    count: usize,
462}
463
464impl Iterator for KeywordIter<'_> {
465    type Item = KeywordEntry;
466
467    fn next(&mut self) -> Option<KeywordEntry> {
468        if self.idx >= self.count {
469            return None;
470        }
471        let raw = self.grammar.template();
472        // SAFETY: all keyword pointers were null-checked in `keywords()`; arrays
473        // are static, length = self.count, and self.idx < self.count.
474        let entry = unsafe {
475            let code = u32::from(*raw.keyword_codes.add(self.idx));
476            let len = *raw.keyword_lens.add(self.idx) as usize;
477            let off = *raw.keyword_offsets.add(self.idx) as usize;
478            let bytes = std::slice::from_raw_parts(raw.keyword_text.cast::<u8>().add(off), len);
479            KeywordEntry {
480                token_type: AnyTokenType(code),
481                keyword: std::str::from_utf8_unchecked(bytes),
482            }
483        };
484        self.idx += 1;
485        Some(entry)
486    }
487
488    fn size_hint(&self) -> (usize, Option<usize>) {
489        let remaining = self.count - self.idx;
490        (remaining, Some(remaining))
491    }
492}
493
494impl ExactSizeIterator for KeywordIter<'_> {}
495
496// ── ffi ───────────────────────────────────────────────────────────────────────
497
498pub(crate) mod ffi {
499    use crate::util::ffi::CCflags;
500
501    /// Mirrors C `SynqTokenCategory` enum defined in
502    /// `include/syntaqlite/grammar.h`.
503    #[repr(u8)]
504    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
505    pub(crate) enum CTokenCategory {
506        Other = 0,
507        Keyword = 1,
508        Identifier = 2,
509        String = 3,
510        Number = 4,
511        Operator = 5,
512        Punctuation = 6,
513        Comment = 7,
514        Variable = 8,
515        Function = 9,
516        Type = 10,
517    }
518
519    impl CTokenCategory {
520        /// Convert a raw byte from the grammar table to a `CTokenCategory`.
521        /// Unknown values map to `Other`.
522        pub(crate) fn from_u8(v: u8) -> Self {
523            match v {
524                1 => Self::Keyword,
525                2 => Self::Identifier,
526                3 => Self::String,
527                4 => Self::Number,
528                5 => Self::Operator,
529                6 => Self::Punctuation,
530                7 => Self::Comment,
531                8 => Self::Variable,
532                9 => Self::Function,
533                10 => Self::Type,
534                _ => Self::Other,
535            }
536        }
537    }
538
    /// Mirrors C `SyntaqliteGrammarTemplate` struct defined in
    /// `include/syntaqlite/grammar.h`.
    ///
    /// The layout is `repr(C)`: field order and types must stay in sync with
    /// the C header. Pointers that this file never dereferences are typed as
    /// opaque `c_void`.
    #[repr(C)]
    pub(crate) struct CGrammarTemplate {
        // Not read in this file. NOTE(review): presumably the grammar's
        // name as a C string — confirm against the header.
        pub(crate) name: *const std::ffi::c_char,

        // Range metadata (opaque here; not read in this file)
        pub(crate) range_meta: *const std::ffi::c_void,

        // AST metadata: `node_names`, `field_meta`, `field_meta_counts` and
        // `list_tags` are indexed by node tag and bounds-checked against
        // `node_count` before every read.
        pub(crate) node_count: u32,
        pub(crate) node_names: *const *const std::ffi::c_char,
        pub(crate) field_meta: *const *const CFieldMeta,
        pub(crate) field_meta_counts: *const u8,
        // Non-zero entry means the node tag is a list node.
        pub(crate) list_tags: *const u8,

        // Parser lifecycle (function pointers provided by grammar)
        pub(crate) parser_alloc: *const std::ffi::c_void,
        pub(crate) parser_init: *const std::ffi::c_void,
        pub(crate) parser_finalize: *const std::ffi::c_void,
        pub(crate) parser_free: *const std::ffi::c_void,
        pub(crate) parser_feed: *const std::ffi::c_void,
        pub(crate) parser_trace: *const std::ffi::c_void,
        pub(crate) parser_expected_tokens: *const std::ffi::c_void,
        pub(crate) parser_completion_context: *const std::ffi::c_void,

        // Tokenizer (function pointer provided by grammar)
        pub(crate) get_token: *const std::ffi::c_void,

        // Keyword table metadata: entry `i` is the `keyword_lens[i]` bytes of
        // `keyword_text` starting at `keyword_offsets[i]`, with token code
        // `keyword_codes[i]`. `keyword_count` is a pointer to the entry count,
        // not the count itself. Any of these pointers may be null, in which
        // case the keyword table is treated as empty.
        pub(crate) keyword_text: *const std::ffi::c_char,
        pub(crate) keyword_offsets: *const u16,
        pub(crate) keyword_lens: *const u8,
        pub(crate) keyword_codes: *const u8,
        pub(crate) keyword_count: *const u32,

        // Token metadata (indexed by token type ordinal). Each byte is a
        // `CTokenCategory` discriminant; the pointer may be null.
        pub(crate) token_categories: *const u8,
        pub(crate) token_type_count: u32,
    }
579
    /// Mirrors C `SyntaqliteGrammar` from `include/syntaqlite/grammar.h`.
    ///
    /// A small by-value handle: a pointer to the grammar tables plus the
    /// per-handle configuration. `Copy`, so cloning never touches the tables.
    #[repr(C)]
    #[derive(Debug, Clone, Copy)]
    pub struct CGrammar {
        /// Pointer to the grammar tables; dereferenced without a null check
        /// by the accessors in this file.
        pub(crate) template: *const CGrammarTemplate,
        /// Target SQLite version as an integer (set from `SqliteVersion::as_int`).
        pub(crate) sqlite_version: i32,
        /// Compile-time compatibility flags (the inner value of `SqliteSyntaxFlags`).
        pub(crate) cflags: CCflags,
    }
588
    /// Mirrors C `SyntaqliteFieldMeta` from `include/syntaqlite_dialect/dialect_types.h`.
    ///
    /// The layout is `repr(C)`: field order and types must stay in sync with
    /// the C header.
    #[repr(C)]
    pub(crate) struct CFieldMeta {
        /// Byte offset of the field within its parent AST node struct.
        pub(crate) offset: u16,
        /// Raw field kind byte; decoded by `FieldKind::from_u8`.
        pub(crate) kind: u8,
        /// Field name as a NUL-terminated C string.
        pub(crate) name: *const std::ffi::c_char,
        /// Array of `display_count` C-string pointers with display names;
        /// the array pointer and individual entries may be null.
        pub(crate) display: *const *const std::ffi::c_char,
        /// Number of entries in `display`.
        pub(crate) display_count: u8,
    }
598}