Skip to main content

syntaqlite_syntax/
grammar.rs

// Copyright 2025 The syntaqlite Authors. All rights reserved.
// Licensed under the Apache License, Version 2.0.
3
// ── Public API ───────────────────────────────────────────────────────────────
5
6use crate::any::{AnyNodeTag, AnyTokenType};
7use crate::util::{SqliteSyntaxFlags, SqliteVersion};
8
/// Shape of a field's value as seen by runtime AST reflection.
///
/// Grammar-agnostic tooling uses this to interpret node fields without
/// relying on generated Rust types.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FieldKind {
    /// Identifier of a child node.
    NodeId = 0,
    /// Source span (byte offset + length).
    Span = 1,
    /// Boolean flag.
    Bool = 2,
    /// Compact bitfield of flags.
    Flags = 3,
    /// Discriminant of an enum variant.
    Enum = 4,
}

impl FieldKind {
    /// Decode a raw kind byte.
    ///
    /// Values `1..=4` select the matching variant; every other value
    /// (including `0`) yields [`FieldKind::NodeId`].
    fn from_u8(v: u8) -> Self {
        // Position `i` holds the variant whose discriminant is `i`.
        const BY_DISCRIMINANT: [FieldKind; 5] = [
            FieldKind::NodeId,
            FieldKind::Span,
            FieldKind::Bool,
            FieldKind::Flags,
            FieldKind::Enum,
        ];
        BY_DISCRIMINANT
            .get(usize::from(v))
            .copied()
            .unwrap_or(FieldKind::NodeId)
    }
}
39
/// Coarse semantic classification of a token.
///
/// Intended for syntax highlighting, token styling, and cheap heuristics that
/// run before any full semantic analysis.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenCategory {
    /// An SQL keyword such as `SELECT`, `FROM`, or `WHERE`.
    Keyword,
    /// A bind parameter or session variable (`:name`, `@var`, `?`).
    Parameter,
    /// A string literal or blob literal.
    String,
    /// A numeric literal.
    Number,
    /// An operator or comparison symbol (`+`, `=`, `||`, …).
    Operator,
    /// A comment (`-- …` or `/* … */`).
    Comment,
    /// Punctuation (`,`, `(`, `)`, `;`, …).
    Punctuation,
    /// An identifier, quoted or unquoted.
    Identifier,
    /// A function name, built-in or user-defined.
    Function,
    /// A type name (in `CAST`, column definitions, …).
    Type,
    /// Any token that fits none of the categories above.
    Other,
}
69
70impl From<ffi::CTokenCategory> for TokenCategory {
71    fn from(c: ffi::CTokenCategory) -> Self {
72        match c {
73            ffi::CTokenCategory::Keyword => Self::Keyword,
74            ffi::CTokenCategory::Identifier => Self::Identifier,
75            ffi::CTokenCategory::String => Self::String,
76            ffi::CTokenCategory::Number => Self::Number,
77            ffi::CTokenCategory::Operator => Self::Operator,
78            ffi::CTokenCategory::Punctuation => Self::Punctuation,
79            ffi::CTokenCategory::Comment => Self::Comment,
80            ffi::CTokenCategory::Variable => Self::Parameter,
81            ffi::CTokenCategory::Function => Self::Function,
82            ffi::CTokenCategory::Type => Self::Type,
83            ffi::CTokenCategory::Other => Self::Other,
84        }
85    }
86}
87
88/// Metadata for one AST field of one node type.
89///
90/// Use this to build generic inspectors, serializers, or debug UIs that can
91/// walk arbitrary grammars.
92pub struct FieldMeta<'a>(pub(crate) &'a ffi::CFieldMeta);
93
94impl FieldMeta<'_> {
95    /// Byte offset of this field within its parent AST node struct.
96    pub fn offset(&self) -> u16 {
97        self.0.offset
98    }
99
100    /// Semantic kind of this field.
101    pub fn kind(&self) -> FieldKind {
102        FieldKind::from_u8(self.0.kind)
103    }
104
105    /// The field name as a `&str`.
106    ///
107    /// # Panics
108    /// Panics if the grammar table contains invalid UTF-8 in the field name
109    /// (which would indicate a codegen bug).
110    pub fn name(&self) -> &'static str {
111        // SAFETY: `FieldMeta` is only constructed from static grammar tables
112        // where `name` is always a valid, NUL-terminated UTF-8 C string.
113        unsafe {
114            let cstr = std::ffi::CStr::from_ptr(self.0.name);
115            cstr.to_str().expect("invalid UTF-8 in field name")
116        }
117    }
118
119    /// The `idx`-th display name for enum variants, if present.
120    ///
121    /// # Panics
122    /// Panics if the grammar table contains invalid UTF-8 in a display name
123    /// (which would indicate a codegen bug).
124    pub fn display_name(&self, idx: usize) -> Option<&'static str> {
125        if self.0.display.is_null() || idx >= self.0.display_count as usize {
126            return None;
127        }
128        // SAFETY: `FieldMeta` is only constructed from static grammar tables;
129        // `display` and its entries are valid static C strings.
130        unsafe {
131            let ptr = *self.0.display.add(idx);
132            if ptr.is_null() {
133                return None;
134            }
135            let cstr = std::ffi::CStr::from_ptr(ptr);
136            Some(cstr.to_str().expect("invalid UTF-8 in display name"))
137        }
138    }
139
140    /// Number of display names for this field.
141    pub fn display_count(&self) -> usize {
142        self.0.display_count as usize
143    }
144}
145
/// How the parser actually used one particular token occurrence.
///
/// Complements the lexical token kind so ambiguous tokens can be told apart
/// (for example keyword text that was used as an identifier).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ParserTokenFlags(u8);

impl ParserTokenFlags {
    // Bit masks — mirror C SYNQ_TOKEN_FLAG_* in syntaqlite/parser.h.
    const AS_ID: u8 = 1 << 0;
    const AS_FUNCTION: u8 = 1 << 1;
    const AS_TYPE: u8 = 1 << 2;

    /// Build from the raw C bitfield (`SyntaqliteParserTokenFlags = uint32_t`).
    ///
    /// # Panics
    /// Panics if `v` does not fit in a `u8`.
    pub(crate) fn from_raw(v: u32) -> Self {
        Self(u8::try_from(v).expect("parser token flags out of range for u8"))
    }

    /// The raw flag bits.
    pub fn bits(self) -> u8 {
        let Self(bits) = self;
        bits
    }

    /// True if the token was used as an identifier (`SYNQ_TOKEN_FLAG_AS_ID`).
    pub fn used_as_identifier(self) -> bool {
        self.has(Self::AS_ID)
    }

    /// True if the token was used as a function name (`SYNQ_TOKEN_FLAG_AS_FUNCTION`).
    pub fn used_as_function(self) -> bool {
        self.has(Self::AS_FUNCTION)
    }

    /// True if the token was used as a type name (`SYNQ_TOKEN_FLAG_AS_TYPE`).
    pub fn used_as_type(self) -> bool {
        self.has(Self::AS_TYPE)
    }

    /// Whether any bit of `mask` is set.
    fn has(self, mask: u8) -> bool {
        (self.0 & mask) != 0
    }
}
185
186/// Trait implemented by generated grammar handles.
187///
188/// End users typically consume implementations rather than writing them.
189/// The trait links a grammar to its typed node and token enums.
190pub trait TypedGrammar: Clone + Into<AnyGrammar> {
191    /// The top-level typed AST node enum for this grammar.
192    type Node<'a>: crate::ast::GrammarNodeType<'a>;
193    /// The grammar's typed node ID, wrapping an [`crate::ast::AnyNodeId`].
194    ///
195    /// Used as the return type of [`TypedNodeList::node_id`](crate::ast::TypedNodeList::node_id)
196    /// so callers get a grammar-typed handle rather than a raw [`crate::ast::AnyNodeId`].
197    type NodeId: Copy + From<crate::ast::AnyNodeId> + Into<crate::ast::AnyNodeId>;
198    /// The typed token enum for this grammar.
199    type Token: crate::ast::GrammarTokenType;
200}
201
202/// Grammar handle for runtime-configurable, grammar-agnostic workflows.
203///
204/// Use `AnyGrammar` when grammar selection/configuration is dynamic (plugins,
205/// LSP hosts, multi-grammar test harnesses). It carries version/cflag knobs
206/// and introspection metadata, while remaining cheap to clone.
207///
208/// Built-in grammars hold `&'static` C data directly. Dynamically loaded
209/// grammars transmute library-memory pointers to `&'static` and keep the
210/// library alive via an [`Arc`](std::sync::Arc). Use `AnyGrammar::load` to create one.
211#[derive(Clone)]
212pub struct AnyGrammar {
213    pub(crate) inner: ffi::CGrammar,
214    /// Keeps the shared library alive for dynamically-loaded grammars.
215    /// `None` for built-in (static) grammars.
216    _keep_alive: Option<std::sync::Arc<dyn Send + Sync>>,
217}
218
219// SAFETY: The grammar wraps an immutable reference to static C data.
220unsafe impl Send for AnyGrammar {}
221// SAFETY: AnyGrammar wraps a *const CGrammar to a static C grammar object; it is safe to share across threads.
222unsafe impl Sync for AnyGrammar {}
223
224impl AnyGrammar {
225    /// Construct a `AnyGrammar` from a raw C grammar value.\
226    ///
227    /// This unsafe method exists only for use by grammar implementations which are code generated.
228    /// End users should never need to call this directly.
229    ///
230    /// # Safety
231    /// The `template` pointer inside `inner` must point to valid, `'static`
232    /// C grammar tables (e.g. returned by a grammar's `extern "C"` grammar
233    /// accessor such as `syntaqlite_sqlite_grammar()`).
234    pub unsafe fn new(inner: ffi::CGrammar) -> Self {
235        AnyGrammar {
236            inner,
237            _keep_alive: None,
238        }
239    }
240
241    /// Load a grammar from a shared library (`.so` / `.dylib` / `.dll`).
242    ///
243    /// Resolves `syntaqlite_<name>_grammar` (or `syntaqlite_grammar` when `name`
244    /// is `None`) and calls it to obtain the grammar handle.
245    ///
246    /// # Errors
247    /// Returns `Err` if the library cannot be opened or the grammar symbol is absent.
248    ///
249    /// # Library lifetime
250    /// The loaded library is kept alive via an [`Arc`] stored inside the
251    /// returned `AnyGrammar`. Dropping the last clone of the grammar unloads
252    /// the library. Use [`syntaqlite::Dialect::load`] for dialect-level loading.
253    #[cfg(feature = "dynload")]
254    pub fn load(path: &str, name: Option<&str>) -> Result<Self, String> {
255        // SAFETY: We keep `lib` alive in an `Arc` below so the grammar pointer
256        // lives as long as any clone of the returned AnyGrammar.
257        let lib = unsafe {
258            libloading::Library::new(path).map_err(|e| format!("failed to load {path:?}: {e}"))?
259        };
260
261        let symbol = match name {
262            Some(n) => format!("syntaqlite_{n}_grammar"),
263            None => "syntaqlite_grammar".to_string(),
264        };
265        // SAFETY: We call the function immediately and drop `func` before `lib`
266        // is moved into the Arc, so there is no lifetime overlap issue.
267        let raw: ffi::CGrammar = unsafe {
268            let func: libloading::Symbol<'_, unsafe extern "C" fn() -> ffi::CGrammar> = lib
269                .get(symbol.as_bytes())
270                .map_err(|e| format!("symbol {symbol:?} not found in {path:?}: {e}"))?;
271            func()
272        };
273
274        let keep_alive: std::sync::Arc<dyn Send + Sync> = std::sync::Arc::new(lib);
275
276        // SAFETY: `raw.template` points into the shared library kept alive by
277        // `keep_alive`. Dropping the last AnyGrammar clone unloads the library.
278        Ok(AnyGrammar {
279            inner: raw,
280            _keep_alive: Some(keep_alive),
281        })
282    }
283
284    /// Pin this grammar handle to a target `SQLite` version.
285    ///
286    /// Useful when your product must emulate a specific engine release.
287    #[must_use]
288    pub fn with_version(mut self, version: SqliteVersion) -> Self {
289        self.inner.sqlite_version = version.as_int();
290        self
291    }
292
293    /// Replace compile-time compatibility flags on this handle.
294    #[must_use]
295    pub fn with_cflags(mut self, flags: SqliteSyntaxFlags) -> Self {
296        self.inner.cflags = flags.0;
297        self
298    }
299
300    /// Target `SQLite` version currently configured on this handle.
301    pub fn version(&self) -> SqliteVersion {
302        SqliteVersion::from_int(self.inner.sqlite_version)
303    }
304
305    /// Active C-parser compile-time compatibility flags.
306    pub fn cflags(&self) -> SqliteSyntaxFlags {
307        SqliteSyntaxFlags(self.inner.cflags)
308    }
309
310    /// Whether this grammar supports Rust-style macro invocations (`name!(args)`).
311    pub fn has_macro_style(&self) -> bool {
312        self.template().macro_style != 0
313    }
314
315    /// Return a reference to the abstract grammar template.
316    #[inline]
317    fn template(&self) -> &'static ffi::CGrammarTemplate {
318        // SAFETY: `inner.template` points to static C data (generated grammar tables).
319        unsafe { &*self.inner.template }
320    }
321
322    /// Return the human-readable node name for `tag`.
323    ///
324    /// # Panics
325    /// Panics if `tag` is out of bounds for this grammar.
326    pub fn node_name(&self, tag: AnyNodeTag) -> &'static str {
327        let raw = self.template();
328        let idx = tag.0 as usize;
329        assert!(
330            idx < raw.node_count as usize,
331            "node tag {} out of bounds (count={})",
332            idx,
333            raw.node_count,
334        );
335        // SAFETY: idx is bounds-checked above; node_names is a static array of
336        // length node_count populated by codegen, with valid NUL-terminated strings.
337        unsafe {
338            let cstr = std::ffi::CStr::from_ptr(*raw.node_names.add(idx));
339            cstr.to_str().expect("invalid UTF-8 in node name")
340        }
341    }
342
343    /// Whether `tag` identifies a list node shape.
344    pub fn is_list(&self, tag: AnyNodeTag) -> bool {
345        let raw = self.template();
346        let idx = tag.0 as usize;
347        if idx >= raw.node_count as usize {
348            return false;
349        }
350        // SAFETY: idx is bounds-checked above; list_tags is a static array of
351        // length node_count populated by codegen.
352        unsafe { *raw.list_tags.add(idx) != 0 }
353    }
354
355    /// Return field metadata for nodes with tag `tag`.
356    pub fn field_meta(&self, tag: AnyNodeTag) -> impl ExactSizeIterator<Item = FieldMeta<'static>> {
357        let raw = self.template();
358        let idx = tag.0 as usize;
359        // SAFETY: idx is bounds-checked; field_meta_counts and field_meta are
360        // parallel static arrays of length node_count populated by codegen.
361        let slice: &'static [ffi::CFieldMeta] = unsafe {
362            if idx >= raw.node_count as usize {
363                &[]
364            } else {
365                let count = *raw.field_meta_counts.add(idx) as usize;
366                let ptr = *raw.field_meta.add(idx);
367                if count == 0 || ptr.is_null() {
368                    &[]
369                } else {
370                    std::slice::from_raw_parts(ptr, count)
371                }
372            }
373        };
374        slice.iter().map(FieldMeta)
375    }
376
377    /// Classify a token for presentation/analysis using parser context when available.
378    pub fn classify_token(
379        &self,
380        token_type: AnyTokenType,
381        flags: ParserTokenFlags,
382    ) -> TokenCategory {
383        if flags.used_as_function() {
384            TokenCategory::Function
385        } else if flags.used_as_type() {
386            TokenCategory::Type
387        } else if flags.used_as_identifier() {
388            TokenCategory::Identifier
389        } else {
390            self.token_category(token_type)
391        }
392    }
393
394    /// Return the default semantic category for a token type ordinal.
395    pub fn token_category(&self, token_type: AnyTokenType) -> TokenCategory {
396        let raw = self.template();
397        let idx = token_type.0 as usize;
398        if raw.token_categories.is_null() || idx >= raw.token_type_count as usize {
399            return TokenCategory::Other;
400        }
401        // SAFETY: token_categories is null-checked; it is a static array of
402        // length token_type_count populated by codegen.
403        let byte = unsafe { *raw.token_categories.add(idx) };
404        TokenCategory::from(ffi::CTokenCategory::from_u8(byte))
405    }
406
407    /// Iterate all keywords known to this grammar.
408    ///
409    /// Yields a [`KeywordEntry`] for each keyword, containing the token type
410    /// ordinal and the keyword lexeme (e.g. `SELECT`, `WHERE`).
411    ///
412    /// The iterator implements [`ExactSizeIterator`], so `.len()` gives the
413    /// total keyword count without consuming the iterator.
414    pub fn keywords(&self) -> impl ExactSizeIterator<Item = KeywordEntry> + '_ {
415        let raw = self.template();
416        let count = if raw.keyword_text.is_null()
417            || raw.keyword_offsets.is_null()
418            || raw.keyword_lens.is_null()
419            || raw.keyword_codes.is_null()
420            || raw.keyword_count.is_null()
421        {
422            0
423        } else {
424            // SAFETY: keyword_count is null-checked above; points to a static u32.
425            unsafe { *raw.keyword_count as usize }
426        };
427        KeywordIter {
428            grammar: self,
429            idx: 0,
430            count,
431        }
432    }
433}
434
435impl TypedGrammar for AnyGrammar {
436    type Node<'a> = crate::ast::AnyNode<'a>;
437    type NodeId = crate::ast::AnyNodeId;
438    type Token = AnyTokenType;
439}
440
441/// One grammar keyword entry.
442///
443/// Yielded by [`AnyGrammar::keywords`] for completions, lexers, and tooling.
444#[derive(Debug, Clone, Copy, PartialEq, Eq)]
445pub struct KeywordEntry {
446    /// The token type for this keyword.
447    token_type: AnyTokenType,
448    /// The keyword lexeme (e.g. `"SELECT"`, `"WHERE"`).
449    keyword: &'static str,
450}
451
452impl KeywordEntry {
453    /// The token type for this keyword.
454    pub fn token_type(&self) -> AnyTokenType {
455        self.token_type
456    }
457    /// The keyword lexeme (e.g. `"SELECT"`, `"WHERE"`).
458    pub fn keyword(&self) -> &'static str {
459        self.keyword
460    }
461}
462
463struct KeywordIter<'a> {
464    grammar: &'a AnyGrammar,
465    idx: usize,
466    count: usize,
467}
468
469impl Iterator for KeywordIter<'_> {
470    type Item = KeywordEntry;
471
472    fn next(&mut self) -> Option<KeywordEntry> {
473        if self.idx >= self.count {
474            return None;
475        }
476        let raw = self.grammar.template();
477        // SAFETY: all keyword pointers were null-checked in `keywords()`; arrays
478        // are static, length = self.count, and self.idx < self.count.
479        let entry = unsafe {
480            let code = u32::from(*raw.keyword_codes.add(self.idx));
481            let len = *raw.keyword_lens.add(self.idx) as usize;
482            let off = *raw.keyword_offsets.add(self.idx) as usize;
483            let bytes = std::slice::from_raw_parts(raw.keyword_text.cast::<u8>().add(off), len);
484            KeywordEntry {
485                token_type: AnyTokenType(code),
486                keyword: std::str::from_utf8_unchecked(bytes),
487            }
488        };
489        self.idx += 1;
490        Some(entry)
491    }
492
493    fn size_hint(&self) -> (usize, Option<usize>) {
494        let remaining = self.count - self.idx;
495        (remaining, Some(remaining))
496    }
497}
498
499impl ExactSizeIterator for KeywordIter<'_> {}
500
// ── ffi ───────────────────────────────────────────────────────────────────────
502
503pub(crate) mod ffi {
504    use crate::util::ffi::CCflags;
505
506    /// Mirrors C `SynqTokenCategory` enum defined in
507    /// `include/syntaqlite/grammar.h`.
508    #[repr(u8)]
509    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
510    pub(crate) enum CTokenCategory {
511        Other = 0,
512        Keyword = 1,
513        Identifier = 2,
514        String = 3,
515        Number = 4,
516        Operator = 5,
517        Punctuation = 6,
518        Comment = 7,
519        Variable = 8,
520        Function = 9,
521        Type = 10,
522    }
523
524    impl CTokenCategory {
525        /// Convert a raw byte from the grammar table to a `CTokenCategory`.
526        /// Unknown values map to `Other`.
527        pub(crate) fn from_u8(v: u8) -> Self {
528            match v {
529                1 => Self::Keyword,
530                2 => Self::Identifier,
531                3 => Self::String,
532                4 => Self::Number,
533                5 => Self::Operator,
534                6 => Self::Punctuation,
535                7 => Self::Comment,
536                8 => Self::Variable,
537                9 => Self::Function,
538                10 => Self::Type,
539                _ => Self::Other,
540            }
541        }
542    }
543
544    /// Mirrors C `SyntaqliteGrammarTemplate` struct defined in
545    /// `include/syntaqlite/grammar.h`.
546    #[repr(C)]
547    pub(crate) struct CGrammarTemplate {
548        pub(crate) name: *const std::ffi::c_char,
549
550        // Range metadata
551        pub(crate) range_meta: *const std::ffi::c_void,
552
553        // AST metadata
554        pub(crate) node_count: u32,
555        pub(crate) node_names: *const *const std::ffi::c_char,
556        pub(crate) field_meta: *const *const CFieldMeta,
557        pub(crate) field_meta_counts: *const u8,
558        pub(crate) list_tags: *const u8,
559
560        // Parser lifecycle (function pointers provided by grammar)
561        pub(crate) parser_alloc: *const std::ffi::c_void,
562        pub(crate) parser_init: *const std::ffi::c_void,
563        pub(crate) parser_finalize: *const std::ffi::c_void,
564        pub(crate) parser_free: *const std::ffi::c_void,
565        pub(crate) parser_feed: *const std::ffi::c_void,
566        pub(crate) parser_trace: *const std::ffi::c_void,
567        pub(crate) parser_expected_tokens: *const std::ffi::c_void,
568        pub(crate) parser_completion_context: *const std::ffi::c_void,
569
570        // Tokenizer (function pointer provided by grammar)
571        pub(crate) get_token: *const std::ffi::c_void,
572
573        // Keyword table metadata
574        pub(crate) keyword_text: *const std::ffi::c_char,
575        pub(crate) keyword_offsets: *const u16,
576        pub(crate) keyword_lens: *const u8,
577        pub(crate) keyword_codes: *const u8,
578        pub(crate) keyword_count: *const u32,
579
580        // Token metadata (indexed by token type ordinal)
581        pub(crate) token_categories: *const u8,
582        pub(crate) token_type_count: u32,
583
584        // Macro invocation style
585        pub(crate) macro_style: u32,
586    }
587
588    /// Mirrors C `SyntaqliteGrammar` from `include/syntaqlite/grammar.h`.
589    #[repr(C)]
590    #[derive(Debug, Clone, Copy)]
591    pub struct CGrammar {
592        pub(crate) template: *const CGrammarTemplate,
593        pub(crate) sqlite_version: i32,
594        pub(crate) cflags: CCflags,
595    }
596
597    /// Mirrors C `SyntaqliteFieldMeta` from `include/syntaqlite_dialect/dialect_types.h`.
598    #[repr(C)]
599    pub(crate) struct CFieldMeta {
600        pub(crate) offset: u16,
601        pub(crate) kind: u8,
602        pub(crate) name: *const std::ffi::c_char,
603        pub(crate) display: *const *const std::ffi::c_char,
604        pub(crate) display_count: u8,
605    }
606}