plotnik_lib/parser/
cst.rs

1//! Syntax kinds for the query language.
2//!
3//! `SyntaxKind` serves dual roles: token kinds (from lexer) and node kinds (from parser).
4//! Logos derives token recognition; node kinds lack token/regex attributes.
5//! `QLang` implements Rowan's `Language` trait for tree construction.
6
7#![allow(dead_code)] // Some items are for future use
8
9use logos::Logos;
10use rowan::Language;
11
12/// All token and node kinds. Tokens first, then nodes, then `__LAST` sentinel.
13/// `#[repr(u16)]` enables safe transmute in `kind_from_raw`.
14#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
15#[repr(u16)]
16pub enum SyntaxKind {
17    #[token("(")]
18    ParenOpen = 0,
19
20    #[token(")")]
21    ParenClose,
22
23    #[token("[")]
24    BracketOpen,
25
26    #[token("]")]
27    BracketClose,
28
29    #[token("{")]
30    BraceOpen,
31
32    #[token("}")]
33    BraceClose,
34
35    /// `::` for type annotations. Defined before `Colon` for correct precedence.
36    #[token("::")]
37    DoubleColon,
38
39    #[token(":")]
40    Colon,
41
42    #[token("=")]
43    Equals,
44
45    #[token("!")]
46    Negation,
47
48    #[token("-")]
49    Minus,
50
51    #[token("~")]
52    Tilde,
53
54    #[token("_")]
55    Underscore,
56
57    #[token("*")]
58    Star,
59
60    #[token("+")]
61    Plus,
62
63    #[token("?")]
64    Question,
65
66    /// Non-greedy `*?` quantifier
67    #[token("*?")]
68    StarQuestion,
69
70    /// Non-greedy `+?` quantifier
71    #[token("+?")]
72    PlusQuestion,
73
74    /// Non-greedy `??` quantifier
75    #[token("??")]
76    QuestionQuestion,
77
78    /// Slash for supertype paths: `(expression/binary_expression)`
79    #[token("/")]
80    Slash,
81
82    /// Comma (invalid separator, for error recovery)
83    #[token(",")]
84    Comma,
85
86    /// Pipe (invalid separator, for error recovery)
87    #[token("|")]
88    Pipe,
89
90    #[regex(r#""(?:[^"\\]|\\.)*""#)]
91    #[regex(r"'(?:[^'\\]|\\.)*'")]
92    #[doc(hidden)]
93    StringLiteral, // Lexer-internal only
94
95    DoubleQuote,
96    SingleQuote,
97    /// String content between quotes
98    StrVal,
99
100    #[token("ERROR")]
101    KwError,
102
103    #[token("MISSING")]
104    KwMissing,
105
106    /// Identifier. Accepts dots/hyphens for tree-sitter compat; parser validates per context.
107    /// Defined after keywords so they take precedence.
108    #[regex(r"[a-zA-Z][a-zA-Z0-9_.\-]*")]
109    Id,
110
111    #[token(".")]
112    Dot,
113
114    /// Regular capture: @name (matches before `At`)
115    #[regex(r"@[a-zA-Z][a-zA-Z0-9_]*")]
116    CaptureToken,
117
118    /// Suppressive capture: @_ or @_name (matches before `At`)
119    #[regex(r"@_[a-zA-Z0-9_]*")]
120    SuppressiveCapture,
121
122    /// Bare @ (for error recovery: "capture without target")
123    #[token("@")]
124    At,
125
126    #[regex(r"[ \t]+")]
127    Whitespace,
128
129    #[token("\n")]
130    #[token("\r\n")]
131    Newline,
132
133    #[regex(r"//[^\n]*", allow_greedy = true)]
134    #[regex(r";[^\n]*", allow_greedy = true)]
135    LineComment,
136
137    #[regex(r"/\*(?:[^*]|\*[^/])*\*/")]
138    BlockComment,
139
140    /// XML-like tags matched as errors (common LLM output)
141    #[regex(r"<[a-zA-Z_:][a-zA-Z0-9_:\.\-]*(?:\s+[^>]*)?>")]
142    #[regex(r"</[a-zA-Z_:][a-zA-Z0-9_:\.\-]*\s*>")]
143    #[regex(r"<[a-zA-Z_:][a-zA-Z0-9_:\.\-]*\s*/\s*>")]
144    XMLGarbage,
145    /// Tree-sitter predicates (unsupported)
146    #[regex(r"#[a-zA-Z_][a-zA-Z0-9_]*[?!]?")]
147    Predicate,
148    /// Coalesced unrecognized characters
149    Garbage,
150    Error,
151
152    Root,
153    Tree,
154    Ref,
155    Str,
156    Field,
157    Capture,
158    Type,
159    Quantifier,
160    Seq,
161    Alt,
162    Branch,
163    Wildcard,
164    Anchor,
165    NegatedField,
166    Def,
167
168    // Must be last - used for bounds checking in `kind_from_raw`
169    #[doc(hidden)]
170    __LAST,
171}
172
173use SyntaxKind::*;
174
175impl SyntaxKind {
176    #[inline]
177    pub fn is_trivia(self) -> bool {
178        matches!(self, Whitespace | Newline | LineComment | BlockComment)
179    }
180
181    #[inline]
182    pub fn is_error(self) -> bool {
183        matches!(self, Error | XMLGarbage | Garbage | Predicate)
184    }
185}
186
187impl From<SyntaxKind> for rowan::SyntaxKind {
188    #[inline]
189    fn from(kind: SyntaxKind) -> Self {
190        Self(kind as u16)
191    }
192}
193
194/// Language tag for Rowan's tree types.
195#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
196pub enum QLang {}
197
198impl Language for QLang {
199    type Kind = SyntaxKind;
200
201    fn kind_from_raw(raw: rowan::SyntaxKind) -> Self::Kind {
202        assert!(raw.0 < __LAST as u16);
203        // SAFETY: We've verified the value is in bounds, and SyntaxKind is repr(u16)
204        unsafe { std::mem::transmute::<u16, SyntaxKind>(raw.0) }
205    }
206
207    fn kind_to_raw(kind: Self::Kind) -> rowan::SyntaxKind {
208        kind.into()
209    }
210}
211
/// Type aliases for Rowan types parameterized by our language.
///
/// `SyntaxNode`: a Rowan node whose kinds are interpreted through [`QLang`].
pub type SyntaxNode = rowan::SyntaxNode<QLang>;
/// A Rowan token whose kinds are interpreted through [`QLang`].
pub type SyntaxToken = rowan::SyntaxToken<QLang>;
/// Either a [`SyntaxNode`] or a [`SyntaxToken`].
pub type SyntaxElement = rowan::NodeOrToken<SyntaxNode, SyntaxToken>;
216
217/// 64-bit bitset of `SyntaxKind`s for O(1) membership testing.
218#[derive(Clone, Copy, PartialEq, Eq)]
219pub struct TokenSet(u64);
220
221impl TokenSet {
222    /// Creates an empty token set.
223    pub const EMPTY: TokenSet = TokenSet(0);
224
225    /// Panics at compile time if any kind's discriminant >= 64.
226    #[inline]
227    pub const fn new(kinds: &[SyntaxKind]) -> Self {
228        let mut bits = 0u64;
229        let mut i = 0;
230        while i < kinds.len() {
231            let kind = kinds[i] as u16;
232            assert!(kind < 64, "SyntaxKind value exceeds TokenSet capacity");
233            bits |= 1 << kind;
234            i += 1;
235        }
236        TokenSet(bits)
237    }
238
239    #[inline]
240    pub const fn single(kind: SyntaxKind) -> Self {
241        let kind = kind as u16;
242        assert!(kind < 64, "SyntaxKind value exceeds TokenSet capacity");
243        TokenSet(1 << kind)
244    }
245
246    #[inline]
247    pub const fn contains(&self, kind: SyntaxKind) -> bool {
248        let kind = kind as u16;
249        if kind >= 64 {
250            return false;
251        }
252        self.0 & (1 << kind) != 0
253    }
254
255    #[inline]
256    pub const fn union(self, other: TokenSet) -> TokenSet {
257        TokenSet(self.0 | other.0)
258    }
259}
260
261impl std::fmt::Debug for TokenSet {
262    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
263        let mut list = f.debug_set();
264        for i in 0..64u16 {
265            if self.0 & (1 << i) != 0 && i < __LAST as u16 {
266                let kind: SyntaxKind = unsafe { std::mem::transmute(i) };
267                list.entry(&kind);
268            }
269        }
270        list.finish()
271    }
272}
273
274/// Pre-defined token sets for the parser.
275pub mod token_sets {
276    use super::*;
277
278    /// FIRST set of expr. `At` excluded (captures wrap, not start).
279    pub const EXPR_FIRST_TOKENS: TokenSet = TokenSet::new(&[
280        ParenOpen,
281        BracketOpen,
282        BraceOpen,
283        Underscore,
284        Id,
285        DoubleQuote,
286        SingleQuote,
287        Dot,
288        Negation,
289        Minus,
290        KwError,
291        KwMissing,
292    ]);
293
294    /// FIRST set for root-level expressions. Excludes `Dot`/`Negation` (tree-internal).
295    pub const ROOT_EXPR_FIRST_TOKENS: TokenSet = TokenSet::new(&[
296        ParenOpen,
297        BracketOpen,
298        BraceOpen,
299        Underscore,
300        Id,
301        DoubleQuote,
302        SingleQuote,
303        KwError,
304        KwMissing,
305    ]);
306
307    pub const QUANTIFIERS: TokenSet = TokenSet::new(&[
308        Star,
309        Plus,
310        Question,
311        StarQuestion,
312        PlusQuestion,
313        QuestionQuestion,
314    ]);
315
316    pub const TRIVIA: TokenSet = TokenSet::new(&[Whitespace, Newline, LineComment, BlockComment]);
317    pub const SEPARATORS: TokenSet = TokenSet::new(&[Comma, Pipe]);
318
319    pub const TREE_RECOVERY_TOKENS: TokenSet = TokenSet::new(&[ParenOpen, BracketOpen, BraceOpen]);
320
321    pub const ALT_RECOVERY_TOKENS: TokenSet = TokenSet::new(&[ParenClose]);
322
323    pub const FIELD_RECOVERY_TOKENS: TokenSet = TokenSet::new(&[
324        ParenClose,
325        BracketClose,
326        BraceClose,
327        CaptureToken,
328        SuppressiveCapture,
329        Colon,
330    ]);
331
332    pub const ROOT_RECOVERY_TOKENS: TokenSet =
333        TokenSet::new(&[ParenOpen, BracketOpen, BraceOpen, Id]);
334
335    pub const DEF_RECOVERY_TOKENS: TokenSet =
336        TokenSet::new(&[ParenOpen, BracketOpen, BraceOpen, Id, Equals]);
337
338    pub const SEQ_RECOVERY_TOKENS: TokenSet =
339        TokenSet::new(&[BraceClose, ParenClose, BracketClose]);
340}