plotnik_compiler/parser/
cst.rs

1//! Syntax kinds for the query language.
2//!
3//! `SyntaxKind` serves dual roles: token kinds (from lexer) and node kinds (from parser).
4//! Logos derives token recognition; node kinds lack token/regex attributes.
5//! `QLang` implements Rowan's `Language` trait for tree construction.
6
7#![allow(dead_code)] // Some items are for future use
8
9use logos::Logos;
10use rowan::Language;
11
12/// All token and node kinds. Tokens first, then nodes, then `__LAST` sentinel.
13/// `#[repr(u16)]` enables safe transmute in `kind_from_raw`.
14#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
15#[repr(u16)]
16pub enum SyntaxKind {
17    #[token("(")]
18    ParenOpen = 0,
19
20    #[token(")")]
21    ParenClose,
22
23    #[token("[")]
24    BracketOpen,
25
26    #[token("]")]
27    BracketClose,
28
29    #[token("{")]
30    BraceOpen,
31
32    #[token("}")]
33    BraceClose,
34
35    /// `::` for type annotations. Defined before `Colon` for correct precedence.
36    #[token("::")]
37    DoubleColon,
38
39    #[token(":")]
40    Colon,
41
42    #[token("=")]
43    Equals,
44
45    #[token("!")]
46    Negation,
47
48    #[token("-")]
49    Minus,
50
51    #[token("~")]
52    Tilde,
53
54    #[token("_")]
55    Underscore,
56
57    #[token("*")]
58    Star,
59
60    #[token("+")]
61    Plus,
62
63    #[token("?")]
64    Question,
65
66    /// Non-greedy `*?` quantifier
67    #[token("*?")]
68    StarQuestion,
69
70    /// Non-greedy `+?` quantifier
71    #[token("+?")]
72    PlusQuestion,
73
74    /// Non-greedy `??` quantifier
75    #[token("??")]
76    QuestionQuestion,
77
78    /// Slash for supertype paths: `(expression/binary_expression)`
79    #[token("/")]
80    Slash,
81
82    /// Comma (invalid separator, for error recovery)
83    #[token(",")]
84    Comma,
85
86    /// Pipe (invalid separator, for error recovery)
87    #[token("|")]
88    Pipe,
89
90    #[regex(r#""(?:[^"\\]|\\.)*""#)]
91    #[regex(r"'(?:[^'\\]|\\.)*'")]
92    #[doc(hidden)]
93    StringLiteral, // Lexer-internal only
94
95    DoubleQuote,
96    SingleQuote,
97    /// String content between quotes
98    StrVal,
99
100    #[token("ERROR")]
101    KwError,
102
103    #[token("MISSING")]
104    KwMissing,
105
106    /// Identifier. Accepts dots/hyphens for tree-sitter compat; parser validates per context.
107    /// Defined after keywords so they take precedence.
108    #[regex(r"[a-zA-Z][a-zA-Z0-9_.\-]*")]
109    Id,
110
111    #[token(".")]
112    Dot,
113
114    /// Regular capture: @name (matches before `At`)
115    #[regex(r"@[a-zA-Z][a-zA-Z0-9_]*")]
116    CaptureToken,
117
118    /// Suppressive capture: @_ or @_name (matches before `At`)
119    #[regex(r"@_[a-zA-Z0-9_]*")]
120    SuppressiveCapture,
121
122    /// Bare @ (for error recovery: "capture without target")
123    #[token("@")]
124    At,
125
126    #[regex(r"[ \t]+")]
127    Whitespace,
128
129    #[token("\n")]
130    #[token("\r\n")]
131    Newline,
132
133    #[regex(r"//[^\n]*", allow_greedy = true)]
134    #[regex(r";[^\n]*", allow_greedy = true)]
135    LineComment,
136
137    #[regex(r"/\*(?:[^*]|\*[^/])*\*/")]
138    BlockComment,
139
140    /// `==` for predicate equals
141    #[token("==")]
142    OpEq,
143
144    /// `!=` for predicate not equals
145    #[token("!=")]
146    OpNe,
147
148    /// `^=` for predicate starts-with
149    #[token("^=")]
150    OpStartsWith,
151
152    /// `$=` for predicate ends-with
153    #[token("$=")]
154    OpEndsWith,
155
156    /// `*=` for predicate contains (defined after `Star` for correct precedence)
157    #[token("*=")]
158    OpContains,
159
160    /// `=~` for predicate regex match (when followed by string or error)
161    #[token("=~")]
162    OpRegexMatch,
163
164    /// `!~` for predicate regex no-match (when followed by string or error)
165    #[token("!~")]
166    OpRegexNoMatch,
167
168    /// `=~` followed by regex literal: `=~ /pattern/`
169    /// Compound token to avoid `//` being lexed as line comment.
170    #[regex(r"=~[ \t\r\n]*/", lex_regex_predicate)]
171    RegexPredicateMatch,
172
173    /// `!~` followed by regex literal: `!~ /pattern/`
174    #[regex(r"!~[ \t\r\n]*/", lex_regex_predicate)]
175    RegexPredicateNoMatch,
176
177    /// Regex literal token (after splitting compound predicate)
178    RegexLiteral,
179
180    /// Regex pattern content (between slashes, set by parser)
181    RegexContent,
182
183    /// XML-like tags matched as errors (common LLM output)
184    #[regex(r"<[a-zA-Z_:][a-zA-Z0-9_:\.\-]*(?:\s+[^>]*)?>")]
185    #[regex(r"</[a-zA-Z_:][a-zA-Z0-9_:\.\-]*\s*>")]
186    #[regex(r"<[a-zA-Z_:][a-zA-Z0-9_:\.\-]*\s*/\s*>")]
187    XMLGarbage,
188    /// Tree-sitter predicates (unsupported)
189    #[regex(r"#[a-zA-Z_][a-zA-Z0-9_]*[?!]?")]
190    TsPredicate,
191    /// Coalesced unrecognized characters
192    Garbage,
193    Error,
194
195    Root,
196    Tree,
197    Ref,
198    Str,
199    Field,
200    Capture,
201    Type,
202    Quantifier,
203    Seq,
204    Alt,
205    Branch,
206    Wildcard,
207    Anchor,
208    NegatedField,
209    Def,
210    /// Predicate on a node: `(identifier == "foo")`
211    NodePredicate,
212    /// Regex literal: `/pattern/`
213    Regex,
214
215    // Must be last - used for bounds checking in `kind_from_raw`
216    #[doc(hidden)]
217    __LAST,
218}
219
220use SyntaxKind::*;
221
222/// Logos callback for regex predicate tokens.
223/// Called after matching `=~\s*/` or `!~\s*/`, consumes until closing unescaped `/`.
224fn lex_regex_predicate(lexer: &mut logos::Lexer<SyntaxKind>) -> bool {
225    let remaining = lexer.remainder();
226    let mut backslash_count = 0;
227
228    for (i, c) in remaining.char_indices() {
229        if c == '/' && backslash_count % 2 == 0 {
230            // Found unescaped closing slash
231            lexer.bump(i + 1);
232            return true;
233        }
234        backslash_count = if c == '\\' { backslash_count + 1 } else { 0 };
235    }
236
237    // No closing slash - consume rest as unclosed regex (parser will error)
238    lexer.bump(remaining.len());
239    true
240}
241
242impl SyntaxKind {
243    #[inline]
244    pub fn is_trivia(self) -> bool {
245        matches!(self, Whitespace | Newline | LineComment | BlockComment)
246    }
247
248    #[inline]
249    pub fn is_error(self) -> bool {
250        matches!(self, Error | XMLGarbage | Garbage | TsPredicate)
251    }
252}
253
254impl From<SyntaxKind> for rowan::SyntaxKind {
255    #[inline]
256    fn from(kind: SyntaxKind) -> Self {
257        Self(kind as u16)
258    }
259}
260
/// Language tag for Rowan's tree types.
/// Uninhabited on purpose: it exists only as a type-level marker carrying the
/// `Language` impl below; no value of it is ever constructed.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum QLang {}
264
265impl Language for QLang {
266    type Kind = SyntaxKind;
267
268    fn kind_from_raw(raw: rowan::SyntaxKind) -> Self::Kind {
269        assert!(raw.0 < __LAST as u16);
270        // SAFETY: We've verified the value is in bounds, and SyntaxKind is repr(u16)
271        unsafe { std::mem::transmute::<u16, SyntaxKind>(raw.0) }
272    }
273
274    fn kind_to_raw(kind: Self::Kind) -> rowan::SyntaxKind {
275        kind.into()
276    }
277}
278
/// Type aliases for Rowan types parameterized by our language.
pub type SyntaxNode = rowan::SyntaxNode<QLang>;
pub type SyntaxToken = rowan::SyntaxToken<QLang>;
/// Either a node or a token — Rowan's generic tree-element type.
pub type SyntaxElement = rowan::NodeOrToken<SyntaxNode, SyntaxToken>;
283
/// 128-bit bitset of `SyntaxKind`s for O(1) membership testing.
/// Capacity covers discriminants 0..128; the constructors assert this bound,
/// and `__LAST` is currently well below it.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct TokenSet(u128);
287
288impl TokenSet {
289    /// Creates an empty token set.
290    pub const EMPTY: TokenSet = TokenSet(0);
291
292    /// Panics at compile time if any kind's discriminant >= 128.
293    #[inline]
294    pub const fn new(kinds: &[SyntaxKind]) -> Self {
295        let mut bits = 0u128;
296        let mut i = 0;
297        while i < kinds.len() {
298            let kind = kinds[i] as u16;
299            assert!(kind < 128, "SyntaxKind value exceeds TokenSet capacity");
300            bits |= 1 << kind;
301            i += 1;
302        }
303        TokenSet(bits)
304    }
305
306    #[inline]
307    pub const fn single(kind: SyntaxKind) -> Self {
308        let kind = kind as u16;
309        assert!(kind < 128, "SyntaxKind value exceeds TokenSet capacity");
310        TokenSet(1 << kind)
311    }
312
313    #[inline]
314    pub const fn contains(&self, kind: SyntaxKind) -> bool {
315        let kind = kind as u16;
316        if kind >= 128 {
317            return false;
318        }
319        self.0 & (1 << kind) != 0
320    }
321
322    #[inline]
323    pub const fn union(self, other: TokenSet) -> TokenSet {
324        TokenSet(self.0 | other.0)
325    }
326}
327
328impl std::fmt::Debug for TokenSet {
329    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
330        let mut list = f.debug_set();
331        for i in 0..128u16 {
332            if self.0 & (1 << i) != 0 && i < __LAST as u16 {
333                let kind: SyntaxKind = unsafe { std::mem::transmute(i) };
334                list.entry(&kind);
335            }
336        }
337        list.finish()
338    }
339}
340
/// Pre-defined token sets for the parser.
pub mod token_sets {
    use super::*;

    /// FIRST set of expr. `At` excluded (captures wrap, not start).
    pub const EXPR_FIRST_TOKENS: TokenSet = TokenSet::new(&[
        ParenOpen,
        BracketOpen,
        BraceOpen,
        Underscore,
        Id,
        DoubleQuote,
        SingleQuote,
        Dot,
        Negation,
        Minus,
        KwError,
        KwMissing,
    ]);

    /// FIRST set for root-level expressions. Excludes `Dot`/`Negation` (tree-internal).
    /// NOTE(review): also excludes `Minus`, unlike `EXPR_FIRST_TOKENS` — presumably
    /// intentional for the same reason; confirm against the parser.
    pub const ROOT_EXPR_FIRST_TOKENS: TokenSet = TokenSet::new(&[
        ParenOpen,
        BracketOpen,
        BraceOpen,
        Underscore,
        Id,
        DoubleQuote,
        SingleQuote,
        KwError,
        KwMissing,
    ]);

    /// All quantifier tokens, greedy and non-greedy.
    pub const QUANTIFIERS: TokenSet = TokenSet::new(&[
        Star,
        Plus,
        Question,
        StarQuestion,
        PlusQuestion,
        QuestionQuestion,
    ]);

    /// Mirrors `SyntaxKind::is_trivia` — keep the two in sync.
    pub const TRIVIA: TokenSet = TokenSet::new(&[Whitespace, Newline, LineComment, BlockComment]);
    /// Invalid separator tokens accepted only for error recovery.
    pub const SEPARATORS: TokenSet = TokenSet::new(&[Comma, Pipe]);

    /// Synchronization tokens for recovery inside a tree node.
    pub const TREE_RECOVERY_TOKENS: TokenSet = TokenSet::new(&[ParenOpen, BracketOpen, BraceOpen]);

    /// Synchronization tokens for recovery inside an alternation.
    pub const ALT_RECOVERY_TOKENS: TokenSet = TokenSet::new(&[ParenClose]);

    /// Synchronization tokens for recovery while parsing a field.
    pub const FIELD_RECOVERY_TOKENS: TokenSet = TokenSet::new(&[
        ParenClose,
        BracketClose,
        BraceClose,
        CaptureToken,
        SuppressiveCapture,
        Colon,
    ]);

    /// Synchronization tokens for recovery at the root level.
    pub const ROOT_RECOVERY_TOKENS: TokenSet =
        TokenSet::new(&[ParenOpen, BracketOpen, BraceOpen, Id]);

    /// Synchronization tokens for recovery inside a definition.
    pub const DEF_RECOVERY_TOKENS: TokenSet =
        TokenSet::new(&[ParenOpen, BracketOpen, BraceOpen, Id, Equals]);

    /// Synchronization tokens for recovery inside a sequence.
    pub const SEQ_RECOVERY_TOKENS: TokenSet =
        TokenSet::new(&[BraceClose, ParenClose, BracketClose]);

    /// Simple predicate comparison operators (excludes the compound
    /// `RegexPredicate*` tokens, which embed the regex literal).
    pub const PREDICATE_OPS: TokenSet = TokenSet::new(&[
        OpEq,
        OpNe,
        OpStartsWith,
        OpEndsWith,
        OpContains,
        OpRegexMatch,
        OpRegexNoMatch,
    ]);
}
417}