syntaqlite_syntax/grammar.rs
1// Copyright 2025 The syntaqlite Authors. All rights reserved.
2// Licensed under the Apache License, Version 2.0.
3
4// ── Public API ───────────────────────────────────────────────────────────────
5
6use crate::any::{AnyNodeTag, AnyTokenType};
7use crate::util::{SqliteSyntaxFlags, SqliteVersion};
8
/// Shape of a single field value, as seen when reflecting over AST nodes at
/// runtime.
///
/// Grammar-agnostic tooling uses this to interpret node fields without
/// relying on generated Rust types. The explicit discriminants match the raw
/// `u8` codes decoded by `FieldKind::from_u8`.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FieldKind {
    /// A child node identifier.
    NodeId = 0,
    /// A source span (byte offset + length).
    Span = 1,
    /// A boolean flag.
    Bool = 2,
    /// A compact bitfield of flags.
    Flags = 3,
    /// A discriminant for an enum variant.
    Enum = 4,
}

impl FieldKind {
    /// Decode a raw kind byte.
    ///
    /// Codes `0..=4` map to the variant with the same discriminant; every
    /// other value falls back to [`FieldKind::NodeId`].
    fn from_u8(v: u8) -> Self {
        // Indexed by discriminant, so position `i` holds the variant whose
        // value is `i`.
        const BY_CODE: [FieldKind; 5] = [
            FieldKind::NodeId,
            FieldKind::Span,
            FieldKind::Bool,
            FieldKind::Flags,
            FieldKind::Enum,
        ];
        BY_CODE
            .get(usize::from(v))
            .copied()
            .unwrap_or(FieldKind::NodeId)
    }
}
39
/// High-level semantic class of a token.
///
/// Commonly used for syntax highlighting, token styling, and lightweight
/// heuristics before full semantic analysis.
///
/// Values are produced by `AnyGrammar::token_category` (table lookup by token
/// type) and `AnyGrammar::classify_token` (parser flags first, table lookup
/// as fallback). The C-side category is converted through
/// `From<ffi::CTokenCategory>`; note that the C `Variable` category surfaces
/// here as [`TokenCategory::Parameter`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenCategory {
    /// SQL keyword (SELECT, FROM, WHERE, …)
    Keyword,
    /// Bind parameter or session variable (`:name`, `@var`, `?`)
    Parameter,
    /// String literal or blob literal
    String,
    /// Numeric literal
    Number,
    /// Operator or comparison symbol (`+`, `=`, `||`, …)
    Operator,
    /// Comment (`-- …` or `/* … */`)
    Comment,
    /// Punctuation (`,`, `(`, `)`, `;`, …)
    Punctuation,
    /// Quoted or unquoted identifier
    Identifier,
    /// Built-in or user-defined function name
    Function,
    /// Type name (in CAST, column definitions, …)
    Type,
    /// Anything that doesn't fall into the above categories; also the result
    /// for token types that have no entry in the grammar's category table.
    Other,
}
69
70impl From<ffi::CTokenCategory> for TokenCategory {
71 fn from(c: ffi::CTokenCategory) -> Self {
72 match c {
73 ffi::CTokenCategory::Keyword => Self::Keyword,
74 ffi::CTokenCategory::Identifier => Self::Identifier,
75 ffi::CTokenCategory::String => Self::String,
76 ffi::CTokenCategory::Number => Self::Number,
77 ffi::CTokenCategory::Operator => Self::Operator,
78 ffi::CTokenCategory::Punctuation => Self::Punctuation,
79 ffi::CTokenCategory::Comment => Self::Comment,
80 ffi::CTokenCategory::Variable => Self::Parameter,
81 ffi::CTokenCategory::Function => Self::Function,
82 ffi::CTokenCategory::Type => Self::Type,
83 ffi::CTokenCategory::Other => Self::Other,
84 }
85 }
86}
87
/// Metadata for one AST field of one node type.
///
/// Use this to build generic inspectors, serializers, or debug UIs that can
/// walk arbitrary grammars.
///
/// This is a thin wrapper around a borrowed C-layout `ffi::CFieldMeta`
/// record; instances are handed out by `AnyGrammar::field_meta`.
pub struct FieldMeta<'a>(pub(crate) &'a ffi::CFieldMeta);
93
94impl FieldMeta<'_> {
95 /// Byte offset of this field within its parent AST node struct.
96 pub fn offset(&self) -> u16 {
97 self.0.offset
98 }
99
100 /// Semantic kind of this field.
101 pub fn kind(&self) -> FieldKind {
102 FieldKind::from_u8(self.0.kind)
103 }
104
105 /// The field name as a `&str`.
106 ///
107 /// # Panics
108 /// Panics if the grammar table contains invalid UTF-8 in the field name
109 /// (which would indicate a codegen bug).
110 pub fn name(&self) -> &'static str {
111 // SAFETY: `FieldMeta` is only constructed from static grammar tables
112 // where `name` is always a valid, NUL-terminated UTF-8 C string.
113 unsafe {
114 let cstr = std::ffi::CStr::from_ptr(self.0.name);
115 cstr.to_str().expect("invalid UTF-8 in field name")
116 }
117 }
118
119 /// The `idx`-th display name for enum variants, if present.
120 ///
121 /// # Panics
122 /// Panics if the grammar table contains invalid UTF-8 in a display name
123 /// (which would indicate a codegen bug).
124 pub fn display_name(&self, idx: usize) -> Option<&'static str> {
125 if self.0.display.is_null() || idx >= self.0.display_count as usize {
126 return None;
127 }
128 // SAFETY: `FieldMeta` is only constructed from static grammar tables;
129 // `display` and its entries are valid static C strings.
130 unsafe {
131 let ptr = *self.0.display.add(idx);
132 if ptr.is_null() {
133 return None;
134 }
135 let cstr = std::ffi::CStr::from_ptr(ptr);
136 Some(cstr.to_str().expect("invalid UTF-8 in display name"))
137 }
138 }
139
140 /// Number of display names for this field.
141 pub fn display_count(&self) -> usize {
142 self.0.display_count as usize
143 }
144}
145
/// Parser-inferred semantic usage for an individual token occurrence.
///
/// Lexical token kind alone cannot tell, for example, whether keyword text
/// was actually used as an identifier; these flags record what the parser
/// decided for one specific occurrence.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ParserTokenFlags(u8);

impl ParserTokenFlags {
    // Bit masks — mirror C SYNQ_TOKEN_FLAG_* in syntaqlite/parser.h.
    const AS_ID: u8 = 1 << 0;
    const AS_FUNCTION: u8 = 1 << 1;
    const AS_TYPE: u8 = 1 << 2;

    /// Construct from a raw C flag bitfield (`SyntaqliteParserTokenFlags = uint32_t`).
    ///
    /// # Panics
    /// Panics if `v` does not fit in a `u8`.
    pub(crate) fn from_raw(v: u32) -> Self {
        Self(u8::try_from(v).expect("parser token flags out of range for u8"))
    }

    /// Returns the underlying flag bits.
    pub fn bits(self) -> u8 {
        let Self(bits) = self;
        bits
    }

    /// True if any bit of `mask` is set.
    fn has(self, mask: u8) -> bool {
        (self.0 & mask) != 0
    }

    /// True if the token was used as an identifier (`SYNQ_TOKEN_FLAG_AS_ID`).
    pub fn used_as_identifier(self) -> bool {
        self.has(Self::AS_ID)
    }

    /// True if the token was used as a function name (`SYNQ_TOKEN_FLAG_AS_FUNCTION`).
    pub fn used_as_function(self) -> bool {
        self.has(Self::AS_FUNCTION)
    }

    /// True if the token was used as a type name (`SYNQ_TOKEN_FLAG_AS_TYPE`).
    pub fn used_as_type(self) -> bool {
        self.has(Self::AS_TYPE)
    }
}
185
/// Trait implemented by generated grammar handles.
///
/// End users typically consume implementations rather than writing them.
/// The trait links a grammar to its typed node and token enums.
///
/// Every implementor must be cheaply convertible into the type-erased
/// [`AnyGrammar`] (`Into<AnyGrammar>` supertrait). `AnyGrammar` implements
/// this trait itself, using the `Any*` node/token types.
pub trait TypedGrammar: Clone + Into<AnyGrammar> {
    /// The top-level typed AST node enum for this grammar.
    type Node<'a>: crate::ast::GrammarNodeType<'a>;
    /// The grammar's typed node ID, wrapping an [`crate::ast::AnyNodeId`].
    ///
    /// Used as the return type of [`TypedNodeList::node_id`](crate::ast::TypedNodeList::node_id)
    /// so callers get a grammar-typed handle rather than a raw [`crate::ast::AnyNodeId`].
    /// The `From`/`Into` bounds make the conversion to and from the untyped
    /// ID available in both directions.
    type NodeId: Copy + From<crate::ast::AnyNodeId> + Into<crate::ast::AnyNodeId>;
    /// The typed token enum for this grammar.
    type Token: crate::ast::GrammarTokenType;
}
201
/// Grammar handle for runtime-configurable, grammar-agnostic workflows.
///
/// Use `AnyGrammar` when grammar selection/configuration is dynamic (plugins,
/// LSP hosts, multi-grammar test harnesses). It carries version/cflag knobs
/// and introspection metadata, while remaining cheap to clone.
///
/// `inner` stores a raw pointer to the C grammar template, which the
/// accessors on this type dereference as `&'static` data. For built-in
/// grammars that data is genuinely static. For dynamically loaded grammars it
/// lives in the shared library's memory, and the library is kept loaded via
/// an [`Arc`](std::sync::Arc) held in `_keep_alive`. Use `AnyGrammar::load`
/// to create one.
///
/// NOTE(review): accessors return `&'static str` values derived from the
/// template. For a dynamically loaded grammar these presumably must not be
/// used after the last clone of the grammar is dropped — confirm and
/// document for callers.
#[derive(Clone)]
pub struct AnyGrammar {
    /// C grammar value: template pointer plus the version/cflags settings.
    pub(crate) inner: ffi::CGrammar,
    /// Keeps the shared library alive for dynamically-loaded grammars.
    /// `None` for built-in (static) grammars.
    _keep_alive: Option<std::sync::Arc<dyn Send + Sync>>,
}
218
// SAFETY: `inner` is a plain-data `CGrammar` whose `template` field is a
// `*const CGrammarTemplate`; the code in this file only ever reads through
// that pointer, and the optional `_keep_alive` handle is itself
// `Send + Sync`, so moving the handle to another thread is sound.
unsafe impl Send for AnyGrammar {}
// SAFETY: same reasoning as for `Send` — shared access only performs reads
// of the grammar tables behind `inner.template`, so `&AnyGrammar` can be
// used from several threads at once.
unsafe impl Sync for AnyGrammar {}
223
224impl AnyGrammar {
225 /// Construct a `AnyGrammar` from a raw C grammar value.\
226 ///
227 /// This unsafe method exists only for use by grammar implementations which are code generated.
228 /// End users should never need to call this directly.
229 ///
230 /// # Safety
231 /// The `template` pointer inside `inner` must point to valid, `'static`
232 /// C grammar tables (e.g. returned by a grammar's `extern "C"` grammar
233 /// accessor such as `syntaqlite_sqlite_grammar()`).
234 pub unsafe fn new(inner: ffi::CGrammar) -> Self {
235 AnyGrammar {
236 inner,
237 _keep_alive: None,
238 }
239 }
240
241 /// Load a grammar from a shared library (`.so` / `.dylib` / `.dll`).
242 ///
243 /// Resolves `syntaqlite_<name>_grammar` (or `syntaqlite_grammar` when `name`
244 /// is `None`) and calls it to obtain the grammar handle.
245 ///
246 /// # Errors
247 /// Returns `Err` if the library cannot be opened or the grammar symbol is absent.
248 ///
249 /// # Library lifetime
250 /// The loaded library is kept alive via an [`Arc`] stored inside the
251 /// returned `AnyGrammar`. Dropping the last clone of the grammar unloads
252 /// the library. Use [`syntaqlite::Dialect::load`] for dialect-level loading.
253 #[cfg(feature = "dynload")]
254 pub fn load(path: &str, name: Option<&str>) -> Result<Self, String> {
255 // SAFETY: We keep `lib` alive in an `Arc` below so the grammar pointer
256 // lives as long as any clone of the returned AnyGrammar.
257 let lib = unsafe {
258 libloading::Library::new(path).map_err(|e| format!("failed to load {path:?}: {e}"))?
259 };
260
261 let symbol = match name {
262 Some(n) => format!("syntaqlite_{n}_grammar"),
263 None => "syntaqlite_grammar".to_string(),
264 };
265 // SAFETY: We call the function immediately and drop `func` before `lib`
266 // is moved into the Arc, so there is no lifetime overlap issue.
267 let raw: ffi::CGrammar = unsafe {
268 let func: libloading::Symbol<'_, unsafe extern "C" fn() -> ffi::CGrammar> = lib
269 .get(symbol.as_bytes())
270 .map_err(|e| format!("symbol {symbol:?} not found in {path:?}: {e}"))?;
271 func()
272 };
273
274 let keep_alive: std::sync::Arc<dyn Send + Sync> = std::sync::Arc::new(lib);
275
276 // SAFETY: `raw.template` points into the shared library kept alive by
277 // `keep_alive`. Dropping the last AnyGrammar clone unloads the library.
278 Ok(AnyGrammar {
279 inner: raw,
280 _keep_alive: Some(keep_alive),
281 })
282 }
283
284 /// Pin this grammar handle to a target `SQLite` version.
285 ///
286 /// Useful when your product must emulate a specific engine release.
287 #[must_use]
288 pub fn with_version(mut self, version: SqliteVersion) -> Self {
289 self.inner.sqlite_version = version.as_int();
290 self
291 }
292
293 /// Replace compile-time compatibility flags on this handle.
294 #[must_use]
295 pub fn with_cflags(mut self, flags: SqliteSyntaxFlags) -> Self {
296 self.inner.cflags = flags.0;
297 self
298 }
299
300 /// Target `SQLite` version currently configured on this handle.
301 pub fn version(&self) -> SqliteVersion {
302 SqliteVersion::from_int(self.inner.sqlite_version)
303 }
304
305 /// Active C-parser compile-time compatibility flags.
306 pub fn cflags(&self) -> SqliteSyntaxFlags {
307 SqliteSyntaxFlags(self.inner.cflags)
308 }
309
310 /// Return a reference to the abstract grammar template.
311 #[inline]
312 fn template(&self) -> &'static ffi::CGrammarTemplate {
313 // SAFETY: `inner.template` points to static C data (generated grammar tables).
314 unsafe { &*self.inner.template }
315 }
316
317 /// Return the human-readable node name for `tag`.
318 ///
319 /// # Panics
320 /// Panics if `tag` is out of bounds for this grammar.
321 pub fn node_name(&self, tag: AnyNodeTag) -> &'static str {
322 let raw = self.template();
323 let idx = tag.0 as usize;
324 assert!(
325 idx < raw.node_count as usize,
326 "node tag {} out of bounds (count={})",
327 idx,
328 raw.node_count,
329 );
330 // SAFETY: idx is bounds-checked above; node_names is a static array of
331 // length node_count populated by codegen, with valid NUL-terminated strings.
332 unsafe {
333 let cstr = std::ffi::CStr::from_ptr(*raw.node_names.add(idx));
334 cstr.to_str().expect("invalid UTF-8 in node name")
335 }
336 }
337
338 /// Whether `tag` identifies a list node shape.
339 pub fn is_list(&self, tag: AnyNodeTag) -> bool {
340 let raw = self.template();
341 let idx = tag.0 as usize;
342 if idx >= raw.node_count as usize {
343 return false;
344 }
345 // SAFETY: idx is bounds-checked above; list_tags is a static array of
346 // length node_count populated by codegen.
347 unsafe { *raw.list_tags.add(idx) != 0 }
348 }
349
350 /// Return field metadata for nodes with tag `tag`.
351 pub fn field_meta(&self, tag: AnyNodeTag) -> impl ExactSizeIterator<Item = FieldMeta<'static>> {
352 let raw = self.template();
353 let idx = tag.0 as usize;
354 // SAFETY: idx is bounds-checked; field_meta_counts and field_meta are
355 // parallel static arrays of length node_count populated by codegen.
356 let slice: &'static [ffi::CFieldMeta] = unsafe {
357 if idx >= raw.node_count as usize {
358 &[]
359 } else {
360 let count = *raw.field_meta_counts.add(idx) as usize;
361 let ptr = *raw.field_meta.add(idx);
362 if count == 0 || ptr.is_null() {
363 &[]
364 } else {
365 std::slice::from_raw_parts(ptr, count)
366 }
367 }
368 };
369 slice.iter().map(FieldMeta)
370 }
371
372 /// Classify a token for presentation/analysis using parser context when available.
373 pub fn classify_token(
374 &self,
375 token_type: AnyTokenType,
376 flags: ParserTokenFlags,
377 ) -> TokenCategory {
378 if flags.used_as_function() {
379 TokenCategory::Function
380 } else if flags.used_as_type() {
381 TokenCategory::Type
382 } else if flags.used_as_identifier() {
383 TokenCategory::Identifier
384 } else {
385 self.token_category(token_type)
386 }
387 }
388
389 /// Return the default semantic category for a token type ordinal.
390 pub fn token_category(&self, token_type: AnyTokenType) -> TokenCategory {
391 let raw = self.template();
392 let idx = token_type.0 as usize;
393 if raw.token_categories.is_null() || idx >= raw.token_type_count as usize {
394 return TokenCategory::Other;
395 }
396 // SAFETY: token_categories is null-checked; it is a static array of
397 // length token_type_count populated by codegen.
398 let byte = unsafe { *raw.token_categories.add(idx) };
399 TokenCategory::from(ffi::CTokenCategory::from_u8(byte))
400 }
401
402 /// Iterate all keywords known to this grammar.
403 ///
404 /// Yields a [`KeywordEntry`] for each keyword, containing the token type
405 /// ordinal and the keyword lexeme (e.g. `SELECT`, `WHERE`).
406 ///
407 /// The iterator implements [`ExactSizeIterator`], so `.len()` gives the
408 /// total keyword count without consuming the iterator.
409 pub fn keywords(&self) -> impl ExactSizeIterator<Item = KeywordEntry> + '_ {
410 let raw = self.template();
411 let count = if raw.keyword_text.is_null()
412 || raw.keyword_offsets.is_null()
413 || raw.keyword_lens.is_null()
414 || raw.keyword_codes.is_null()
415 || raw.keyword_count.is_null()
416 {
417 0
418 } else {
419 // SAFETY: keyword_count is null-checked above; points to a static u32.
420 unsafe { *raw.keyword_count as usize }
421 };
422 KeywordIter {
423 grammar: self,
424 idx: 0,
425 count,
426 }
427 }
428}
429
// The type-erased grammar is itself a `TypedGrammar`: its node, node-ID and
// token types are the grammar-agnostic `Any*` variants.
impl TypedGrammar for AnyGrammar {
    type Node<'a> = crate::ast::AnyNode<'a>;
    type NodeId = crate::ast::AnyNodeId;
    type Token = AnyTokenType;
}
435
/// One grammar keyword entry.
///
/// Yielded by [`AnyGrammar::keywords`] for completions, lexers, and tooling.
/// Fields are private; read them through [`KeywordEntry::token_type`] and
/// [`KeywordEntry::keyword`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct KeywordEntry {
    /// The token type for this keyword.
    token_type: AnyTokenType,
    /// The keyword lexeme (e.g. `"SELECT"`, `"WHERE"`).
    keyword: &'static str,
}

impl KeywordEntry {
    /// The token type for this keyword.
    pub fn token_type(&self) -> AnyTokenType {
        self.token_type
    }
    /// The keyword lexeme (e.g. `"SELECT"`, `"WHERE"`).
    ///
    /// The string borrows directly from the grammar's keyword text table.
    pub fn keyword(&self) -> &'static str {
        self.keyword
    }
}
457
458struct KeywordIter<'a> {
459 grammar: &'a AnyGrammar,
460 idx: usize,
461 count: usize,
462}
463
464impl Iterator for KeywordIter<'_> {
465 type Item = KeywordEntry;
466
467 fn next(&mut self) -> Option<KeywordEntry> {
468 if self.idx >= self.count {
469 return None;
470 }
471 let raw = self.grammar.template();
472 // SAFETY: all keyword pointers were null-checked in `keywords()`; arrays
473 // are static, length = self.count, and self.idx < self.count.
474 let entry = unsafe {
475 let code = u32::from(*raw.keyword_codes.add(self.idx));
476 let len = *raw.keyword_lens.add(self.idx) as usize;
477 let off = *raw.keyword_offsets.add(self.idx) as usize;
478 let bytes = std::slice::from_raw_parts(raw.keyword_text.cast::<u8>().add(off), len);
479 KeywordEntry {
480 token_type: AnyTokenType(code),
481 keyword: std::str::from_utf8_unchecked(bytes),
482 }
483 };
484 self.idx += 1;
485 Some(entry)
486 }
487
488 fn size_hint(&self) -> (usize, Option<usize>) {
489 let remaining = self.count - self.idx;
490 (remaining, Some(remaining))
491 }
492}
493
494impl ExactSizeIterator for KeywordIter<'_> {}
495
496// ── ffi ───────────────────────────────────────────────────────────────────────
497
// C-layout mirrors of the grammar structures. The `#[repr(C)]` structs below
// are read through raw pointers, so field order and field types must stay in
// sync with the C headers named in each doc comment.
pub(crate) mod ffi {
    use crate::util::ffi::CCflags;

    /// Mirrors C `SynqTokenCategory` enum defined in
    /// `include/syntaqlite/grammar.h`.
    ///
    /// Stored as one byte per token type in
    /// `CGrammarTemplate::token_categories`.
    #[repr(u8)]
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    pub(crate) enum CTokenCategory {
        Other = 0,
        Keyword = 1,
        Identifier = 2,
        String = 3,
        Number = 4,
        Operator = 5,
        Punctuation = 6,
        Comment = 7,
        Variable = 8,
        Function = 9,
        Type = 10,
    }

    impl CTokenCategory {
        /// Convert a raw byte from the grammar table to a `CTokenCategory`.
        /// Unknown values map to `Other`.
        pub(crate) fn from_u8(v: u8) -> Self {
            match v {
                1 => Self::Keyword,
                2 => Self::Identifier,
                3 => Self::String,
                4 => Self::Number,
                5 => Self::Operator,
                6 => Self::Punctuation,
                7 => Self::Comment,
                8 => Self::Variable,
                9 => Self::Function,
                10 => Self::Type,
                // Covers 0 (`Other`) as well as any unrecognized value.
                _ => Self::Other,
            }
        }
    }

    /// Mirrors C `SyntaqliteGrammarTemplate` struct defined in
    /// `include/syntaqlite/grammar.h`.
    #[repr(C)]
    pub(crate) struct CGrammarTemplate {
        // Not read anywhere in this file.
        pub(crate) name: *const std::ffi::c_char,

        // Range metadata (opaque here; not read in this file)
        pub(crate) range_meta: *const std::ffi::c_void,

        // AST metadata. The four arrays are indexed by node tag and are
        // bounds-checked against `node_count` by the accessors in this file.
        pub(crate) node_count: u32,
        pub(crate) node_names: *const *const std::ffi::c_char,
        // Per-node pointer to that node's field records; an entry may be null.
        pub(crate) field_meta: *const *const CFieldMeta,
        // Per-node number of field records.
        pub(crate) field_meta_counts: *const u8,
        // Per-node flag; non-zero marks a list node.
        pub(crate) list_tags: *const u8,

        // Parser lifecycle (function pointers provided by grammar); kept as
        // opaque pointers and never called from this file.
        pub(crate) parser_alloc: *const std::ffi::c_void,
        pub(crate) parser_init: *const std::ffi::c_void,
        pub(crate) parser_finalize: *const std::ffi::c_void,
        pub(crate) parser_free: *const std::ffi::c_void,
        pub(crate) parser_feed: *const std::ffi::c_void,
        pub(crate) parser_trace: *const std::ffi::c_void,
        pub(crate) parser_expected_tokens: *const std::ffi::c_void,
        pub(crate) parser_completion_context: *const std::ffi::c_void,

        // Tokenizer (function pointer provided by grammar)
        pub(crate) get_token: *const std::ffi::c_void,

        // Keyword table metadata. Keyword `i` is the `keyword_lens[i]` bytes
        // of `keyword_text` starting at `keyword_offsets[i]`, with token code
        // `keyword_codes[i]`. Any of these pointers may be null, in which
        // case the grammar is treated as having no keywords.
        pub(crate) keyword_text: *const std::ffi::c_char,
        pub(crate) keyword_offsets: *const u16,
        pub(crate) keyword_lens: *const u8,
        pub(crate) keyword_codes: *const u8,
        // Pointer to the keyword count (not the count itself).
        pub(crate) keyword_count: *const u32,

        // Token metadata (indexed by token type ordinal); `token_categories`
        // may be null, and holds `CTokenCategory` bytes otherwise.
        pub(crate) token_categories: *const u8,
        pub(crate) token_type_count: u32,
    }

    /// Mirrors C `SyntaqliteGrammar` from `include/syntaqlite/grammar.h`.
    ///
    /// A small by-value handle: the template pointer plus the per-handle
    /// version and compile-flag settings adjusted by
    /// `AnyGrammar::with_version` / `AnyGrammar::with_cflags`.
    #[repr(C)]
    #[derive(Debug, Clone, Copy)]
    pub struct CGrammar {
        pub(crate) template: *const CGrammarTemplate,
        pub(crate) sqlite_version: i32,
        pub(crate) cflags: CCflags,
    }

    /// Mirrors C `SyntaqliteFieldMeta` from `include/syntaqlite_dialect/dialect_types.h`.
    #[repr(C)]
    pub(crate) struct CFieldMeta {
        // Byte offset of the field inside its node struct.
        pub(crate) offset: u16,
        // Raw field kind; decoded by `FieldKind::from_u8`.
        pub(crate) kind: u8,
        // NUL-terminated field name.
        pub(crate) name: *const std::ffi::c_char,
        // Optional table of `display_count` display-name strings; the table
        // pointer and individual entries may be null.
        pub(crate) display: *const *const std::ffi::c_char,
        pub(crate) display_count: u8,
    }
}