plotnik_lib/parser/
cst.rs

1//! Syntax kinds for the query language.
2//!
3//! `SyntaxKind` serves dual roles: token kinds (from lexer) and node kinds (from parser).
4//! Logos derives token recognition; node kinds lack token/regex attributes.
5//! `QLang` implements Rowan's `Language` trait for tree construction.
6
7#![allow(dead_code)] // Some items are for future use
8
9use logos::Logos;
10use rowan::Language;
11
12/// All token and node kinds. Tokens first, then nodes, then `__LAST` sentinel.
13/// `#[repr(u16)]` enables safe transmute in `kind_from_raw`.
14#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
15#[repr(u16)]
16pub enum SyntaxKind {
17    #[token("(")]
18    ParenOpen = 0,
19
20    #[token(")")]
21    ParenClose,
22
23    #[token("[")]
24    BracketOpen,
25
26    #[token("]")]
27    BracketClose,
28
29    #[token("{")]
30    BraceOpen,
31
32    #[token("}")]
33    BraceClose,
34
35    /// `::` for type annotations. Defined before `Colon` for correct precedence.
36    #[token("::")]
37    DoubleColon,
38
39    #[token(":")]
40    Colon,
41
42    #[token("=")]
43    Equals,
44
45    #[token("!")]
46    Negation,
47
48    #[token("~")]
49    Tilde,
50
51    #[token("_")]
52    Underscore,
53
54    #[token("*")]
55    Star,
56
57    #[token("+")]
58    Plus,
59
60    #[token("?")]
61    Question,
62
63    /// Non-greedy `*?` quantifier
64    #[token("*?")]
65    StarQuestion,
66
67    /// Non-greedy `+?` quantifier
68    #[token("+?")]
69    PlusQuestion,
70
71    /// Non-greedy `??` quantifier
72    #[token("??")]
73    QuestionQuestion,
74
75    /// Slash for supertype paths: `(expression/binary_expression)`
76    #[token("/")]
77    Slash,
78
79    /// Comma (invalid separator, for error recovery)
80    #[token(",")]
81    Comma,
82
83    /// Pipe (invalid separator, for error recovery)
84    #[token("|")]
85    Pipe,
86
87    /// String literal (split by lexer into quote + content + quote)
88    #[regex(r#""(?:[^"\\]|\\.)*""#)]
89    #[regex(r"'(?:[^'\\]|\\.)*'")]
90    StringLiteral,
91
92    DoubleQuote,
93    SingleQuote,
94    /// String content between quotes
95    StrVal,
96
97    #[token("ERROR")]
98    KwError,
99
100    #[token("MISSING")]
101    KwMissing,
102
103    /// Identifier. Accepts dots/hyphens for tree-sitter compat; parser validates per context.
104    /// Defined after keywords so they take precedence.
105    #[regex(r"[a-zA-Z][a-zA-Z0-9_.\-]*")]
106    Id,
107
108    #[token(".")]
109    Dot,
110
111    #[token("@")]
112    At,
113
114    #[regex(r"[ \t]+")]
115    Whitespace,
116
117    #[token("\n")]
118    #[token("\r\n")]
119    Newline,
120
121    #[regex(r"//[^\n]*", allow_greedy = true)]
122    LineComment,
123
124    #[regex(r"/\*(?:[^*]|\*[^/])*\*/")]
125    BlockComment,
126
127    /// XML-like tags matched as errors (common LLM output)
128    #[regex(r"<[a-zA-Z_:][a-zA-Z0-9_:\.\-]*(?:\s+[^>]*)?>")]
129    #[regex(r"</[a-zA-Z_:][a-zA-Z0-9_:\.\-]*\s*>")]
130    #[regex(r"<[a-zA-Z_:][a-zA-Z0-9_:\.\-]*\s*/\s*>")]
131    XMLGarbage,
132    /// Tree-sitter predicates (unsupported)
133    #[regex(r"#[a-zA-Z_][a-zA-Z0-9_]*[?!]?")]
134    Predicate,
135    /// Coalesced unrecognized characters
136    Garbage,
137    Error,
138
139    // --- Node kinds (non-terminals) ---
140    Root,
141    Tree,
142    Ref,
143    Str,
144    Field,
145    Capture,
146    Type,
147    Quantifier,
148    Seq,
149    Alt,
150    Branch,
151    Wildcard,
152    Anchor,
153    NegatedField,
154    Def,
155
156    // Must be last - used for bounds checking in `kind_from_raw`
157    #[doc(hidden)]
158    __LAST,
159}
160
161use SyntaxKind::*;
162
163impl SyntaxKind {
164    #[inline]
165    pub fn is_trivia(self) -> bool {
166        matches!(self, Whitespace | Newline | LineComment | BlockComment)
167    }
168
169    #[inline]
170    pub fn is_error(self) -> bool {
171        matches!(self, Error | XMLGarbage | Garbage | Predicate)
172    }
173}
174
175impl From<SyntaxKind> for rowan::SyntaxKind {
176    #[inline]
177    fn from(kind: SyntaxKind) -> Self {
178        Self(kind as u16)
179    }
180}
181
/// Language tag for Rowan's tree types.
///
/// Declared without variants, so no value of this type can exist; it is only used as a
/// type parameter (see the `Language` impl and the `Syntax*` aliases in this file).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum QLang {}
185
186impl Language for QLang {
187    type Kind = SyntaxKind;
188
189    fn kind_from_raw(raw: rowan::SyntaxKind) -> Self::Kind {
190        assert!(raw.0 < __LAST as u16);
191        // SAFETY: We've verified the value is in bounds, and SyntaxKind is repr(u16)
192        unsafe { std::mem::transmute::<u16, SyntaxKind>(raw.0) }
193    }
194
195    fn kind_to_raw(kind: Self::Kind) -> rowan::SyntaxKind {
196        kind.into()
197    }
198}
199
/// Rowan syntax node parameterized by our language ([`QLang`]).
pub type SyntaxNode = rowan::SyntaxNode<QLang>;
/// Rowan syntax token parameterized by our language ([`QLang`]).
pub type SyntaxToken = rowan::SyntaxToken<QLang>;
/// Either a [`SyntaxNode`] or a [`SyntaxToken`].
pub type SyntaxElement = rowan::NodeOrToken<SyntaxNode, SyntaxToken>;
204
205/// 64-bit bitset of `SyntaxKind`s for O(1) membership testing.
206#[derive(Clone, Copy, PartialEq, Eq)]
207pub struct TokenSet(u64);
208
209impl TokenSet {
210    /// Creates an empty token set.
211    pub const EMPTY: TokenSet = TokenSet(0);
212
213    /// Panics at compile time if any kind's discriminant >= 64.
214    #[inline]
215    pub const fn new(kinds: &[SyntaxKind]) -> Self {
216        let mut bits = 0u64;
217        let mut i = 0;
218        while i < kinds.len() {
219            let kind = kinds[i] as u16;
220            assert!(kind < 64, "SyntaxKind value exceeds TokenSet capacity");
221            bits |= 1 << kind;
222            i += 1;
223        }
224        TokenSet(bits)
225    }
226
227    #[inline]
228    pub const fn single(kind: SyntaxKind) -> Self {
229        let kind = kind as u16;
230        assert!(kind < 64, "SyntaxKind value exceeds TokenSet capacity");
231        TokenSet(1 << kind)
232    }
233
234    #[inline]
235    pub const fn contains(&self, kind: SyntaxKind) -> bool {
236        let kind = kind as u16;
237        if kind >= 64 {
238            return false;
239        }
240        self.0 & (1 << kind) != 0
241    }
242
243    #[inline]
244    pub const fn union(self, other: TokenSet) -> TokenSet {
245        TokenSet(self.0 | other.0)
246    }
247}
248
249impl std::fmt::Debug for TokenSet {
250    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
251        let mut list = f.debug_set();
252        for i in 0..64u16 {
253            if self.0 & (1 << i) != 0 && i < __LAST as u16 {
254                let kind: SyntaxKind = unsafe { std::mem::transmute(i) };
255                list.entry(&kind);
256            }
257        }
258        list.finish()
259    }
260}
261
262/// Pre-defined token sets for the parser.
263pub mod token_sets {
264    use super::*;
265
266    /// FIRST set of expr. `At` excluded (captures wrap, not start).
267    pub const EXPR_FIRST: TokenSet = TokenSet::new(&[
268        ParenOpen,
269        BracketOpen,
270        BraceOpen,
271        Underscore,
272        Id,
273        DoubleQuote,
274        SingleQuote,
275        Dot,
276        Negation,
277        KwError,
278        KwMissing,
279    ]);
280
281    /// FIRST set for root-level expressions. Excludes `Dot`/`Negation` (tree-internal).
282    pub const ROOT_EXPR_FIRST: TokenSet = TokenSet::new(&[
283        ParenOpen,
284        BracketOpen,
285        BraceOpen,
286        Underscore,
287        Id,
288        DoubleQuote,
289        SingleQuote,
290        KwError,
291        KwMissing,
292    ]);
293
294    pub const QUANTIFIERS: TokenSet = TokenSet::new(&[
295        Star,
296        Plus,
297        Question,
298        StarQuestion,
299        PlusQuestion,
300        QuestionQuestion,
301    ]);
302
303    pub const TRIVIA: TokenSet = TokenSet::new(&[Whitespace, Newline, LineComment, BlockComment]);
304    pub const SEPARATORS: TokenSet = TokenSet::new(&[Comma, Pipe]);
305
306    pub const TREE_RECOVERY: TokenSet = TokenSet::new(&[ParenOpen, BracketOpen, BraceOpen]);
307
308    pub const ALT_RECOVERY: TokenSet = TokenSet::new(&[ParenClose]);
309
310    pub const FIELD_RECOVERY: TokenSet =
311        TokenSet::new(&[ParenClose, BracketClose, BraceClose, At, Colon]);
312
313    pub const ROOT_RECOVERY: TokenSet = TokenSet::new(&[ParenOpen, BracketOpen, BraceOpen, Id]);
314
315    pub const DEF_RECOVERY: TokenSet =
316        TokenSet::new(&[ParenOpen, BracketOpen, BraceOpen, Id, Equals]);
317
318    pub const SEQ_RECOVERY: TokenSet = TokenSet::new(&[BraceClose, ParenClose, BracketClose]);
319}