syntaqlite_syntax/grammar.rs
1// Copyright 2025 The syntaqlite Authors. All rights reserved.
2// Licensed under the Apache License, Version 2.0.
3
4// ── Public API ───────────────────────────────────────────────────────────────
5
6use crate::any::{AnyNodeTag, AnyTokenType};
7use crate::util::{SqliteSyntaxFlags, SqliteVersion};
8
/// Shape of a single field value, as seen by reflective AST tooling.
///
/// Grammar-agnostic consumers use this to decide how to read a field out of
/// a node when no generated Rust type is available for it.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FieldKind {
    /// Identifier of a child node.
    NodeId = 0,
    /// Source span (byte offset + length).
    Span = 1,
    /// Boolean flag.
    Bool = 2,
    /// Compact bitfield of flags.
    Flags = 3,
    /// Discriminant of an enum variant.
    Enum = 4,
}

impl FieldKind {
    /// Decode a raw kind byte.
    ///
    /// Bytes `0..=4` select the variant with that discriminant; every other
    /// byte falls back to [`FieldKind::NodeId`].
    fn from_u8(v: u8) -> Self {
        // Indexed by discriminant, so the table order must mirror the enum.
        const BY_DISCRIMINANT: [FieldKind; 5] = [
            FieldKind::NodeId,
            FieldKind::Span,
            FieldKind::Bool,
            FieldKind::Flags,
            FieldKind::Enum,
        ];
        BY_DISCRIMINANT
            .get(usize::from(v))
            .copied()
            .unwrap_or(FieldKind::NodeId)
    }
}
39
/// Coarse semantic class assigned to a token.
///
/// Typical consumers are syntax highlighters, token styling, and cheap
/// heuristics that run before any full semantic analysis.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenCategory {
    /// An SQL keyword such as `SELECT`, `FROM`, or `WHERE`.
    Keyword,
    /// A bind parameter or session variable (`:name`, `@var`, `?`).
    Parameter,
    /// A string or blob literal.
    String,
    /// A numeric literal.
    Number,
    /// An operator or comparison symbol (`+`, `=`, `||`, …).
    Operator,
    /// A comment, either `-- …` or `/* … */`.
    Comment,
    /// Punctuation (`,`, `(`, `)`, `;`, …).
    Punctuation,
    /// An identifier, quoted or unquoted.
    Identifier,
    /// The name of a built-in or user-defined function.
    Function,
    /// A type name (in `CAST`, column definitions, …).
    Type,
    /// Any token that fits none of the categories above.
    Other,
}
69
70impl From<ffi::CTokenCategory> for TokenCategory {
71 fn from(c: ffi::CTokenCategory) -> Self {
72 match c {
73 ffi::CTokenCategory::Keyword => Self::Keyword,
74 ffi::CTokenCategory::Identifier => Self::Identifier,
75 ffi::CTokenCategory::String => Self::String,
76 ffi::CTokenCategory::Number => Self::Number,
77 ffi::CTokenCategory::Operator => Self::Operator,
78 ffi::CTokenCategory::Punctuation => Self::Punctuation,
79 ffi::CTokenCategory::Comment => Self::Comment,
80 ffi::CTokenCategory::Variable => Self::Parameter,
81 ffi::CTokenCategory::Function => Self::Function,
82 ffi::CTokenCategory::Type => Self::Type,
83 ffi::CTokenCategory::Other => Self::Other,
84 }
85 }
86}
87
88/// Metadata for one AST field of one node type.
89///
90/// Use this to build generic inspectors, serializers, or debug UIs that can
91/// walk arbitrary grammars.
92pub struct FieldMeta<'a>(pub(crate) &'a ffi::CFieldMeta);
93
94impl FieldMeta<'_> {
95 /// Byte offset of this field within its parent AST node struct.
96 pub fn offset(&self) -> u16 {
97 self.0.offset
98 }
99
100 /// Semantic kind of this field.
101 pub fn kind(&self) -> FieldKind {
102 FieldKind::from_u8(self.0.kind)
103 }
104
105 /// The field name as a `&str`.
106 ///
107 /// # Panics
108 /// Panics if the grammar table contains invalid UTF-8 in the field name
109 /// (which would indicate a codegen bug).
110 pub fn name(&self) -> &'static str {
111 // SAFETY: `FieldMeta` is only constructed from static grammar tables
112 // where `name` is always a valid, NUL-terminated UTF-8 C string.
113 unsafe {
114 let cstr = std::ffi::CStr::from_ptr(self.0.name);
115 cstr.to_str().expect("invalid UTF-8 in field name")
116 }
117 }
118
119 /// The `idx`-th display name for enum variants, if present.
120 ///
121 /// # Panics
122 /// Panics if the grammar table contains invalid UTF-8 in a display name
123 /// (which would indicate a codegen bug).
124 pub fn display_name(&self, idx: usize) -> Option<&'static str> {
125 if self.0.display.is_null() || idx >= self.0.display_count as usize {
126 return None;
127 }
128 // SAFETY: `FieldMeta` is only constructed from static grammar tables;
129 // `display` and its entries are valid static C strings.
130 unsafe {
131 let ptr = *self.0.display.add(idx);
132 if ptr.is_null() {
133 return None;
134 }
135 let cstr = std::ffi::CStr::from_ptr(ptr);
136 Some(cstr.to_str().expect("invalid UTF-8 in display name"))
137 }
138 }
139
140 /// Number of display names for this field.
141 pub fn display_count(&self) -> usize {
142 self.0.display_count as usize
143 }
144}
145
/// How the parser actually used one particular token occurrence.
///
/// This sits alongside the lexical token kind and disambiguates tokens whose
/// text alone is not enough (for example keyword text used as an identifier).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ParserTokenFlags(u8);

impl ParserTokenFlags {
    // Bit masks — mirror C SYNQ_TOKEN_FLAG_* in syntaqlite/parser.h.
    const AS_ID: u8 = 1;
    const AS_FUNCTION: u8 = 2;
    const AS_TYPE: u8 = 4;

    /// Wrap a raw C flag bitfield (`SyntaqliteParserTokenFlags = uint32_t`).
    ///
    /// # Panics
    /// Panics if `v` does not fit in a `u8`.
    pub(crate) fn from_raw(v: u32) -> Self {
        Self(u8::try_from(v).expect("parser token flags out of range for u8"))
    }

    /// The raw flag bits.
    pub fn bits(self) -> u8 {
        let Self(bits) = self;
        bits
    }

    /// True if the token was used as an identifier (`SYNQ_TOKEN_FLAG_AS_ID`).
    pub fn used_as_identifier(self) -> bool {
        self.has(Self::AS_ID)
    }

    /// True if the token was used as a function name (`SYNQ_TOKEN_FLAG_AS_FUNCTION`).
    pub fn used_as_function(self) -> bool {
        self.has(Self::AS_FUNCTION)
    }

    /// True if the token was used as a type name (`SYNQ_TOKEN_FLAG_AS_TYPE`).
    pub fn used_as_type(self) -> bool {
        self.has(Self::AS_TYPE)
    }

    /// Whether any bit of `mask` is set.
    fn has(self, mask: u8) -> bool {
        (self.0 & mask) != 0
    }
}
185
186/// Trait implemented by generated grammar handles.
187///
188/// End users typically consume implementations rather than writing them.
189/// The trait links a grammar to its typed node and token enums.
190pub trait TypedGrammar: Clone + Into<AnyGrammar> {
191 /// The top-level typed AST node enum for this grammar.
192 type Node<'a>: crate::ast::GrammarNodeType<'a>;
193 /// The grammar's typed node ID, wrapping an [`crate::ast::AnyNodeId`].
194 ///
195 /// Used as the return type of [`TypedNodeList::node_id`](crate::ast::TypedNodeList::node_id)
196 /// so callers get a grammar-typed handle rather than a raw [`crate::ast::AnyNodeId`].
197 type NodeId: Copy + From<crate::ast::AnyNodeId> + Into<crate::ast::AnyNodeId>;
198 /// The typed token enum for this grammar.
199 type Token: crate::ast::GrammarTokenType;
200}
201
202/// Grammar handle for runtime-configurable, grammar-agnostic workflows.
203///
204/// Use `AnyGrammar` when grammar selection/configuration is dynamic (plugins,
205/// LSP hosts, multi-grammar test harnesses). It carries version/cflag knobs
206/// and introspection metadata, while remaining cheap to clone.
207///
208/// Built-in grammars hold `&'static` C data directly. Dynamically loaded
209/// grammars transmute library-memory pointers to `&'static` and keep the
210/// library alive via an [`Arc`](std::sync::Arc). Use `AnyGrammar::load` to create one.
211#[derive(Clone)]
212pub struct AnyGrammar {
213 pub(crate) inner: ffi::CGrammar,
214 /// Keeps the shared library alive for dynamically-loaded grammars.
215 /// `None` for built-in (static) grammars.
216 _keep_alive: Option<std::sync::Arc<dyn Send + Sync>>,
217}
218
219// SAFETY: The grammar wraps an immutable reference to static C data.
220unsafe impl Send for AnyGrammar {}
221// SAFETY: AnyGrammar wraps a *const CGrammar to a static C grammar object; it is safe to share across threads.
222unsafe impl Sync for AnyGrammar {}
223
224impl AnyGrammar {
225 /// Construct a `AnyGrammar` from a raw C grammar value.\
226 ///
227 /// This unsafe method exists only for use by grammar implementations which are code generated.
228 /// End users should never need to call this directly.
229 ///
230 /// # Safety
231 /// The `template` pointer inside `inner` must point to valid, `'static`
232 /// C grammar tables (e.g. returned by a grammar's `extern "C"` grammar
233 /// accessor such as `syntaqlite_sqlite_grammar()`).
234 pub unsafe fn new(inner: ffi::CGrammar) -> Self {
235 AnyGrammar {
236 inner,
237 _keep_alive: None,
238 }
239 }
240
241 /// Load a grammar from a shared library (`.so` / `.dylib` / `.dll`).
242 ///
243 /// Resolves `syntaqlite_<name>_grammar` (or `syntaqlite_grammar` when `name`
244 /// is `None`) and calls it to obtain the grammar handle.
245 ///
246 /// # Errors
247 /// Returns `Err` if the library cannot be opened or the grammar symbol is absent.
248 ///
249 /// # Library lifetime
250 /// The loaded library is kept alive via an [`Arc`] stored inside the
251 /// returned `AnyGrammar`. Dropping the last clone of the grammar unloads
252 /// the library. Use [`syntaqlite::Dialect::load`] for dialect-level loading.
253 #[cfg(feature = "dynload")]
254 pub fn load(path: &str, name: Option<&str>) -> Result<Self, String> {
255 // SAFETY: We keep `lib` alive in an `Arc` below so the grammar pointer
256 // lives as long as any clone of the returned AnyGrammar.
257 let lib = unsafe {
258 libloading::Library::new(path).map_err(|e| format!("failed to load {path:?}: {e}"))?
259 };
260
261 let symbol = match name {
262 Some(n) => format!("syntaqlite_{n}_grammar"),
263 None => "syntaqlite_grammar".to_string(),
264 };
265 // SAFETY: We call the function immediately and drop `func` before `lib`
266 // is moved into the Arc, so there is no lifetime overlap issue.
267 let raw: ffi::CGrammar = unsafe {
268 let func: libloading::Symbol<'_, unsafe extern "C" fn() -> ffi::CGrammar> = lib
269 .get(symbol.as_bytes())
270 .map_err(|e| format!("symbol {symbol:?} not found in {path:?}: {e}"))?;
271 func()
272 };
273
274 let keep_alive: std::sync::Arc<dyn Send + Sync> = std::sync::Arc::new(lib);
275
276 // SAFETY: `raw.template` points into the shared library kept alive by
277 // `keep_alive`. Dropping the last AnyGrammar clone unloads the library.
278 Ok(AnyGrammar {
279 inner: raw,
280 _keep_alive: Some(keep_alive),
281 })
282 }
283
284 /// Pin this grammar handle to a target `SQLite` version.
285 ///
286 /// Useful when your product must emulate a specific engine release.
287 #[must_use]
288 pub fn with_version(mut self, version: SqliteVersion) -> Self {
289 self.inner.sqlite_version = version.as_int();
290 self
291 }
292
293 /// Replace compile-time compatibility flags on this handle.
294 #[must_use]
295 pub fn with_cflags(mut self, flags: SqliteSyntaxFlags) -> Self {
296 self.inner.cflags = flags.0;
297 self
298 }
299
300 /// Target `SQLite` version currently configured on this handle.
301 pub fn version(&self) -> SqliteVersion {
302 SqliteVersion::from_int(self.inner.sqlite_version)
303 }
304
305 /// Active C-parser compile-time compatibility flags.
306 pub fn cflags(&self) -> SqliteSyntaxFlags {
307 SqliteSyntaxFlags(self.inner.cflags)
308 }
309
310 /// Whether this grammar supports Rust-style macro invocations (`name!(args)`).
311 pub fn has_macro_style(&self) -> bool {
312 self.template().macro_style != 0
313 }
314
315 /// Return a reference to the abstract grammar template.
316 #[inline]
317 fn template(&self) -> &'static ffi::CGrammarTemplate {
318 // SAFETY: `inner.template` points to static C data (generated grammar tables).
319 unsafe { &*self.inner.template }
320 }
321
322 /// Return the human-readable node name for `tag`.
323 ///
324 /// # Panics
325 /// Panics if `tag` is out of bounds for this grammar.
326 pub fn node_name(&self, tag: AnyNodeTag) -> &'static str {
327 let raw = self.template();
328 let idx = tag.0 as usize;
329 assert!(
330 idx < raw.node_count as usize,
331 "node tag {} out of bounds (count={})",
332 idx,
333 raw.node_count,
334 );
335 // SAFETY: idx is bounds-checked above; node_names is a static array of
336 // length node_count populated by codegen, with valid NUL-terminated strings.
337 unsafe {
338 let cstr = std::ffi::CStr::from_ptr(*raw.node_names.add(idx));
339 cstr.to_str().expect("invalid UTF-8 in node name")
340 }
341 }
342
343 /// Whether `tag` identifies a list node shape.
344 pub fn is_list(&self, tag: AnyNodeTag) -> bool {
345 let raw = self.template();
346 let idx = tag.0 as usize;
347 if idx >= raw.node_count as usize {
348 return false;
349 }
350 // SAFETY: idx is bounds-checked above; list_tags is a static array of
351 // length node_count populated by codegen.
352 unsafe { *raw.list_tags.add(idx) != 0 }
353 }
354
355 /// Return field metadata for nodes with tag `tag`.
356 pub fn field_meta(&self, tag: AnyNodeTag) -> impl ExactSizeIterator<Item = FieldMeta<'static>> {
357 let raw = self.template();
358 let idx = tag.0 as usize;
359 // SAFETY: idx is bounds-checked; field_meta_counts and field_meta are
360 // parallel static arrays of length node_count populated by codegen.
361 let slice: &'static [ffi::CFieldMeta] = unsafe {
362 if idx >= raw.node_count as usize {
363 &[]
364 } else {
365 let count = *raw.field_meta_counts.add(idx) as usize;
366 let ptr = *raw.field_meta.add(idx);
367 if count == 0 || ptr.is_null() {
368 &[]
369 } else {
370 std::slice::from_raw_parts(ptr, count)
371 }
372 }
373 };
374 slice.iter().map(FieldMeta)
375 }
376
377 /// Classify a token for presentation/analysis using parser context when available.
378 pub fn classify_token(
379 &self,
380 token_type: AnyTokenType,
381 flags: ParserTokenFlags,
382 ) -> TokenCategory {
383 if flags.used_as_function() {
384 TokenCategory::Function
385 } else if flags.used_as_type() {
386 TokenCategory::Type
387 } else if flags.used_as_identifier() {
388 TokenCategory::Identifier
389 } else {
390 self.token_category(token_type)
391 }
392 }
393
394 /// Return the default semantic category for a token type ordinal.
395 pub fn token_category(&self, token_type: AnyTokenType) -> TokenCategory {
396 let raw = self.template();
397 let idx = token_type.0 as usize;
398 if raw.token_categories.is_null() || idx >= raw.token_type_count as usize {
399 return TokenCategory::Other;
400 }
401 // SAFETY: token_categories is null-checked; it is a static array of
402 // length token_type_count populated by codegen.
403 let byte = unsafe { *raw.token_categories.add(idx) };
404 TokenCategory::from(ffi::CTokenCategory::from_u8(byte))
405 }
406
407 /// Iterate all keywords known to this grammar.
408 ///
409 /// Yields a [`KeywordEntry`] for each keyword, containing the token type
410 /// ordinal and the keyword lexeme (e.g. `SELECT`, `WHERE`).
411 ///
412 /// The iterator implements [`ExactSizeIterator`], so `.len()` gives the
413 /// total keyword count without consuming the iterator.
414 pub fn keywords(&self) -> impl ExactSizeIterator<Item = KeywordEntry> + '_ {
415 let raw = self.template();
416 let count = if raw.keyword_text.is_null()
417 || raw.keyword_offsets.is_null()
418 || raw.keyword_lens.is_null()
419 || raw.keyword_codes.is_null()
420 || raw.keyword_count.is_null()
421 {
422 0
423 } else {
424 // SAFETY: keyword_count is null-checked above; points to a static u32.
425 unsafe { *raw.keyword_count as usize }
426 };
427 KeywordIter {
428 grammar: self,
429 idx: 0,
430 count,
431 }
432 }
433}
434
435impl TypedGrammar for AnyGrammar {
436 type Node<'a> = crate::ast::AnyNode<'a>;
437 type NodeId = crate::ast::AnyNodeId;
438 type Token = AnyTokenType;
439}
440
441/// One grammar keyword entry.
442///
443/// Yielded by [`AnyGrammar::keywords`] for completions, lexers, and tooling.
444#[derive(Debug, Clone, Copy, PartialEq, Eq)]
445pub struct KeywordEntry {
446 /// The token type for this keyword.
447 token_type: AnyTokenType,
448 /// The keyword lexeme (e.g. `"SELECT"`, `"WHERE"`).
449 keyword: &'static str,
450}
451
452impl KeywordEntry {
453 /// The token type for this keyword.
454 pub fn token_type(&self) -> AnyTokenType {
455 self.token_type
456 }
457 /// The keyword lexeme (e.g. `"SELECT"`, `"WHERE"`).
458 pub fn keyword(&self) -> &'static str {
459 self.keyword
460 }
461}
462
463struct KeywordIter<'a> {
464 grammar: &'a AnyGrammar,
465 idx: usize,
466 count: usize,
467}
468
469impl Iterator for KeywordIter<'_> {
470 type Item = KeywordEntry;
471
472 fn next(&mut self) -> Option<KeywordEntry> {
473 if self.idx >= self.count {
474 return None;
475 }
476 let raw = self.grammar.template();
477 // SAFETY: all keyword pointers were null-checked in `keywords()`; arrays
478 // are static, length = self.count, and self.idx < self.count.
479 let entry = unsafe {
480 let code = u32::from(*raw.keyword_codes.add(self.idx));
481 let len = *raw.keyword_lens.add(self.idx) as usize;
482 let off = *raw.keyword_offsets.add(self.idx) as usize;
483 let bytes = std::slice::from_raw_parts(raw.keyword_text.cast::<u8>().add(off), len);
484 KeywordEntry {
485 token_type: AnyTokenType(code),
486 keyword: std::str::from_utf8_unchecked(bytes),
487 }
488 };
489 self.idx += 1;
490 Some(entry)
491 }
492
493 fn size_hint(&self) -> (usize, Option<usize>) {
494 let remaining = self.count - self.idx;
495 (remaining, Some(remaining))
496 }
497}
498
499impl ExactSizeIterator for KeywordIter<'_> {}
500
501// ── ffi ───────────────────────────────────────────────────────────────────────
502
503pub(crate) mod ffi {
504 use crate::util::ffi::CCflags;
505
506 /// Mirrors C `SynqTokenCategory` enum defined in
507 /// `include/syntaqlite/grammar.h`.
508 #[repr(u8)]
509 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
510 pub(crate) enum CTokenCategory {
511 Other = 0,
512 Keyword = 1,
513 Identifier = 2,
514 String = 3,
515 Number = 4,
516 Operator = 5,
517 Punctuation = 6,
518 Comment = 7,
519 Variable = 8,
520 Function = 9,
521 Type = 10,
522 }
523
524 impl CTokenCategory {
525 /// Convert a raw byte from the grammar table to a `CTokenCategory`.
526 /// Unknown values map to `Other`.
527 pub(crate) fn from_u8(v: u8) -> Self {
528 match v {
529 1 => Self::Keyword,
530 2 => Self::Identifier,
531 3 => Self::String,
532 4 => Self::Number,
533 5 => Self::Operator,
534 6 => Self::Punctuation,
535 7 => Self::Comment,
536 8 => Self::Variable,
537 9 => Self::Function,
538 10 => Self::Type,
539 _ => Self::Other,
540 }
541 }
542 }
543
544 /// Mirrors C `SyntaqliteGrammarTemplate` struct defined in
545 /// `include/syntaqlite/grammar.h`.
546 #[repr(C)]
547 pub(crate) struct CGrammarTemplate {
548 pub(crate) name: *const std::ffi::c_char,
549
550 // Range metadata
551 pub(crate) range_meta: *const std::ffi::c_void,
552
553 // AST metadata
554 pub(crate) node_count: u32,
555 pub(crate) node_names: *const *const std::ffi::c_char,
556 pub(crate) field_meta: *const *const CFieldMeta,
557 pub(crate) field_meta_counts: *const u8,
558 pub(crate) list_tags: *const u8,
559
560 // Parser lifecycle (function pointers provided by grammar)
561 pub(crate) parser_alloc: *const std::ffi::c_void,
562 pub(crate) parser_init: *const std::ffi::c_void,
563 pub(crate) parser_finalize: *const std::ffi::c_void,
564 pub(crate) parser_free: *const std::ffi::c_void,
565 pub(crate) parser_feed: *const std::ffi::c_void,
566 pub(crate) parser_trace: *const std::ffi::c_void,
567 pub(crate) parser_expected_tokens: *const std::ffi::c_void,
568 pub(crate) parser_completion_context: *const std::ffi::c_void,
569
570 // Tokenizer (function pointer provided by grammar)
571 pub(crate) get_token: *const std::ffi::c_void,
572
573 // Keyword table metadata
574 pub(crate) keyword_text: *const std::ffi::c_char,
575 pub(crate) keyword_offsets: *const u16,
576 pub(crate) keyword_lens: *const u8,
577 pub(crate) keyword_codes: *const u8,
578 pub(crate) keyword_count: *const u32,
579
580 // Token metadata (indexed by token type ordinal)
581 pub(crate) token_categories: *const u8,
582 pub(crate) token_type_count: u32,
583
584 // Macro invocation style
585 pub(crate) macro_style: u32,
586 }
587
588 /// Mirrors C `SyntaqliteGrammar` from `include/syntaqlite/grammar.h`.
589 #[repr(C)]
590 #[derive(Debug, Clone, Copy)]
591 pub struct CGrammar {
592 pub(crate) template: *const CGrammarTemplate,
593 pub(crate) sqlite_version: i32,
594 pub(crate) cflags: CCflags,
595 }
596
597 /// Mirrors C `SyntaqliteFieldMeta` from `include/syntaqlite_dialect/dialect_types.h`.
598 #[repr(C)]
599 pub(crate) struct CFieldMeta {
600 pub(crate) offset: u16,
601 pub(crate) kind: u8,
602 pub(crate) name: *const std::ffi::c_char,
603 pub(crate) display: *const *const std::ffi::c_char,
604 pub(crate) display_count: u8,
605 }
606}