// relon_parser/syntax.rs

//! Concrete syntax tree (CST) foundation built on `rowan`.
//!
//! The v2 parser produces a lossless `SyntaxNode` tree: every byte of
//! input source — including whitespace and comments — is reachable
//! from the root via tokens, and walking the tree back to a string
//! yields the original bytes (verbatim).
//!
//! This module defines:
//!   - [`SyntaxKind`] — the unified token + node taxonomy. Every
//!     leaf in a `SyntaxNode` has a leaf `SyntaxKind`; every composite
//!     branch has a node `SyntaxKind`.
//!   - [`RelonLanguage`] — the rowan-side phantom that fixes the
//!     `SyntaxNode` / `SyntaxToken` / `SyntaxElement` type aliases to
//!     our `SyntaxKind`.
//!
//! The kinds are organised into ranges so callers can ask "is this a
//! trivia leaf?", "is this a punctuation leaf?", "is this a composite
//! node?" without an exhaustive match.

use std::fmt;

/// All token and node kinds the v2 parser produces.
///
/// The discriminant is kept stable and small (`u16`) so rowan's green
/// tree can stash it efficiently. Inserting a kind in the middle shifts
/// the value of every later kind and can silently change the boundary
/// checks below, so the layout rules are:
///
///   - new composite-node kinds are appended immediately before
///     [`SyntaxKind::__LAST`];
///   - a new leaf kind has to sit before [`SyntaxKind::DOCUMENT`] so
///     `is_token` keeps classifying it as a leaf (this is why the
///     f-string sub-tokens live where they do).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[repr(u16)]
#[allow(non_camel_case_types)]
pub enum SyntaxKind {
    // ----- trivia (covers every byte rowan would otherwise drop) ------
    /// Run of space / `\t` / `\n` / `\r` characters between meaningful
    /// tokens.
    WHITESPACE,
    /// `// ...` to end of line.
    LINE_COMMENT,
    /// `/* ... */` (may span lines).
    BLOCK_COMMENT,

    // ----- literals + identifiers ------------------------------------
    /// Any `[A-Za-z_][A-Za-z0-9_]*` — keywords are NOT split out at
    /// lex time; the parser checks the text where context matters
    /// (`where`, `match`, `with`, `from`, `as`, etc.).
    IDENT,
    /// Integer / hex / octal / binary / float / scientific. The lexer
    /// captures the whole literal as one token; semantic conversion
    /// to `i64` / `f64` happens later.
    NUMBER,
    /// Any of: plain `"..."`, raw `r"..."` / `r#"..."#`, f-string
    /// `f"..."` / `f#"..."#`. The whole literal — opening quote
    /// through closing quote — is one token at the CST level. The
    /// typed-AST layer breaks f-strings into `FString` parts.
    STRING,

    // ----- single-char punctuation -----------------------------------
    /// `{`
    L_BRACE,
    /// `}`
    R_BRACE,
    /// `[`
    L_BRACK,
    /// `]`
    R_BRACK,
    /// `(`
    L_PAREN,
    /// `)`
    R_PAREN,
    /// `,`
    COMMA,
    /// `:`
    COLON,
    /// `.`
    DOT,
    /// `@` — decorator sigil.
    AT,
    /// `#` — directive sigil.
    HASH,
    /// `&` — reference sigil (`&root.x`).
    AMP,
    /// `?` — optional-type marker or ternary head.
    QUESTION,
    /// `=` — standalone assignment-position equals.
    EQ,

    // ----- multi-char punctuation / operators ------------------------
    /// `...` spread / variadic.
    ELLIPSIS,
    /// `==`
    EQ_EQ,
    /// `!=`
    BANG_EQ,
    /// `<=`
    LT_EQ,
    /// `>=`
    GT_EQ,
    /// `&&`
    AMP_AMP,
    /// `||`
    PIPE_PIPE,
    /// `++`
    PLUS_PLUS,
    /// `=>`
    FAT_ARROW,
    /// `->`
    THIN_ARROW,

    // ----- single-char operators -------------------------------------
    /// `<`
    LT,
    /// `>`
    GT,
    /// `+`
    PLUS,
    /// `-`
    MINUS,
    /// `*` — multiplication, wildcard, or spread depending on context.
    STAR,
    /// `/`
    SLASH,
    /// `%`
    PERCENT,
    /// `!`
    BANG,
    /// `|`
    PIPE,
    /// A bare `_` (an underscore NOT followed by another identifier
    /// char). The Rust-style pattern wildcard for match catch-all arms
    /// (`_: result`) and ignored variant-payload slots. A `_foo` /
    /// `my_var` still lexes as `IDENT`. The schema-field "any-value"
    /// validator keeps its own `*` spelling (`STAR`); the two roles are
    /// deliberately distinct tokens.
    UNDERSCORE,

    /// Any source byte the lexer couldn't classify (stray UTF-8
    /// punctuation, control characters, etc.). Emitted as a single-
    /// codepoint token so the round-trip-by-bytes invariant holds.
    /// Downstream tooling treats this like a syntax error.
    UNKNOWN,

    // ----- f-string sub-tokens ---------------------------------------
    // The lexer emits an entire f-string as one `STRING` leaf so the
    // round-trip-by-bytes invariant holds without any cross-token
    // coordination. The CST builder then refines that single leaf into
    // an `F_STRING` node containing the leaves below + nested
    // `F_STRING_INTERPOLATION` sub-nodes (whose own children are
    // ordinary Relon expressions). The leaves stay BEFORE `DOCUMENT`
    // in the enum order so `is_token` keeps working.
    /// Opening `f"` / `f#"` / `f##"` ... — `#` count varies.
    F_STRING_OPEN,
    /// Closing `"` / `"#` / `"##` matching the open count.
    F_STRING_CLOSE,
    /// Verbatim literal chunk between interpolations / quotes.
    F_STRING_LITERAL,
    /// `${`
    F_STRING_INTERP_START,
    /// Closing `}` of an interpolation.
    F_STRING_INTERP_END,

    // ----- composite-node kinds (populated through P2/P3) ------------
    //
    // Each kind below names a grammar production. Their byte content
    // is reachable through their child tokens / nodes; rowan stitches
    // it all back into source via `SyntaxNode::text`. P2 fills these
    // in; P1 only needs `DOCUMENT` + `ERROR` to round-trip-lex.
    //
    /// Whole-file root. Always present. Children:
    /// trivia*, top-level directives*, top-level value, trivia*.
    DOCUMENT,
    /// A `#name <body?>` form.
    DIRECTIVE,
    /// `@name(args?)` form.
    DECORATOR,
    /// `{ ... }` dict / object literal.
    DICT,
    /// One `key: value` (or `key(params): body`) pair inside a DICT.
    DICT_FIELD,
    /// `[ ... ]` list / array literal.
    LIST,
    /// `for x in xs if cond` body inside a LIST.
    COMPREHENSION,
    /// `name(p, q, ...) [-> R]: body` lowered to closure.
    CLOSURE,
    /// Single closure parameter (`name: T` or bare `name`).
    CLOSURE_PARAM,
    /// `name(arg1, arg2 = expr, ...)` call.
    CALL_EXPR,
    /// One arg inside a call's parens — positional or `name = expr`.
    CALL_ARG,
    /// Binary operation node (`a + b`, `a == b`, etc.).
    BINARY_EXPR,
    /// Unary operation node (`!a`, `-a`).
    UNARY_EXPR,
    /// `cond ? then : else`.
    TERNARY_EXPR,
    /// `&base.x.y` reference.
    REFERENCE_EXPR,
    /// `name[.tail]*` bareword path.
    VARIABLE_EXPR,
    /// `expr where { bindings }`.
    WHERE_EXPR,
    /// `expr match { type: arm, ... }`.
    MATCH_EXPR,
    /// One arm inside a MATCH_EXPR.
    MATCH_ARM,
    /// Rust-like enum payload match pattern, e.g. `Pair(a, b)`.
    MATCH_PATTERN,
    /// `EnumName.VariantName { ... }`.
    VARIANT_CTOR,
    /// `f"..."` rendered as a CST node so interpolations are children.
    /// The lexer emits the whole f-string as one `STRING` leaf; the
    /// CST builder breaks it into F_STRING_OPEN, F_STRING_LITERAL
    /// chunks, F_STRING_INTERPOLATION children, and F_STRING_CLOSE.
    F_STRING,
    /// One `${ expr }` zone inside an [`SyntaxKind::F_STRING`]. Children are
    /// `F_STRING_INTERP_START`, then a regular Relon expression node,
    /// then `F_STRING_INTERP_END`.
    F_STRING_INTERPOLATION,
    /// Spread expression `...expr` inside a dict / list.
    SPREAD_EXPR,
    /// A type expression: `Int`, `List<String>`, `User?`, …
    TYPE_NODE,
    /// `*` in wildcard / placeholder position.
    WILDCARD,
    /// Literal `true` / `false`, and the removed `null` spelling.
    LITERAL,
    /// Unrecoverable parse failure: spans the bytes the parser
    /// couldn't fit into any production. Always has at least one
    /// child token. This is the "first-class hole" that lets
    /// downstream tooling keep working on partial input.
    ERROR,
    /// `(T1, T2, ...)` tuple type — appears in type-hint position
    /// (`(Int, String) pair: ...`) and inside generic argument lists
    /// (`List<(Int, String)>`). The 1-tuple uses a trailing-comma
    /// `(T,)` disambiguator; `()` is the zero-tuple.
    TUPLE_TYPE,
    /// `#schema ... with { ... }` body — a structured method list.
    /// Children: one `SCHEMA_METHOD` per declaration plus any
    /// schema-level pragma directives. The CST keeps every byte
    /// verbatim; the typed-AST layer reads the structure.
    SCHEMA_WITH,
    /// One method declaration inside a [`SyntaxKind::SCHEMA_WITH`] block.
    /// Children: optional pragma directives (`#derive`, `#native`,
    /// `#internal`), an IDENT method name, optional `<T>` generics,
    /// `CLOSURE_PARAM` list, a TYPE_NODE return type, and an
    /// expression body (omitted when `#native` is set).
    SCHEMA_METHOD,
    /// One variant inside a Rust-like `#enum Name { ... }` declaration.
    ENUM_VARIANT,
    /// One named payload field inside a `#enum` variant body.
    ENUM_VARIANT_FIELD,
    /// `(e1, e2, ...)` tuple value literal. Distinct from a
    /// parenthesised group `(e)` (which carries no comma) and from the
    /// `(p, q) => body` closure form. The 1-tuple uses a trailing-comma
    /// `(e,)` disambiguator; `()` is the zero-tuple (unit). Children are
    /// the element expressions in source order.
    TUPLE,

    // Append new node kinds above this line.
    /// Sentinel: one past the last real kind, so
    /// `(kind as u16) < (SyntaxKind::__LAST as u16)` is available as a
    /// range check. Never produced by the lexer or parser.
    __LAST,
}

254impl SyntaxKind {
255    /// True for `WHITESPACE` / `LINE_COMMENT` / `BLOCK_COMMENT` —
256    /// tokens that carry no semantic content. Useful for skipping
257    /// when walking the tree for meaningful structure.
258    pub fn is_trivia(self) -> bool {
259        matches!(
260            self,
261            SyntaxKind::WHITESPACE | SyntaxKind::LINE_COMMENT | SyntaxKind::BLOCK_COMMENT
262        )
263    }
264
265    /// True when the kind names a leaf (token) rather than a
266    /// composite branch (node). All kinds before `DOCUMENT` in the
267    /// enum order are leaves; everything from `DOCUMENT` to `ERROR`
268    /// is a node. Keep in sync with the enum layout above.
269    pub fn is_token(self) -> bool {
270        (self as u16) < (SyntaxKind::DOCUMENT as u16)
271    }
272
273    pub fn is_node(self) -> bool {
274        !self.is_token()
275    }
276}
277
278impl fmt::Display for SyntaxKind {
279    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
280        write!(f, "{:?}", self)
281    }
282}
283
284impl From<SyntaxKind> for rowan::SyntaxKind {
285    fn from(kind: SyntaxKind) -> Self {
286        rowan::SyntaxKind(kind as u16)
287    }
288}
289
/// rowan-side phantom that ties [`SyntaxKind`] to rowan's tree
/// generics.
///
/// The enum has no variants, so no value of it can ever be
/// constructed — it is used only at the type level.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum RelonLanguage {}

296impl rowan::Language for RelonLanguage {
297    type Kind = SyntaxKind;
298
299    fn kind_from_raw(raw: rowan::SyntaxKind) -> Self::Kind {
300        SyntaxKind::from_raw(raw.0).unwrap_or_else(|| panic!("raw kind out of range: {raw:?}"))
301    }
302
303    fn kind_to_raw(kind: Self::Kind) -> rowan::SyntaxKind {
304        kind.into()
305    }
306}
307
308impl SyntaxKind {
309    /// Round-trip back from the raw `u16` rowan stores in its green
310    /// tree. Total over the enum's domain; returns `None` for any
311    /// out-of-range value. The match is exhaustive so the compiler
312    /// catches missing entries when new kinds are appended.
313    pub fn from_raw(raw: u16) -> Option<Self> {
314        let kind = match raw {
315            x if x == Self::WHITESPACE as u16 => Self::WHITESPACE,
316            x if x == Self::LINE_COMMENT as u16 => Self::LINE_COMMENT,
317            x if x == Self::BLOCK_COMMENT as u16 => Self::BLOCK_COMMENT,
318            x if x == Self::IDENT as u16 => Self::IDENT,
319            x if x == Self::NUMBER as u16 => Self::NUMBER,
320            x if x == Self::STRING as u16 => Self::STRING,
321            x if x == Self::L_BRACE as u16 => Self::L_BRACE,
322            x if x == Self::R_BRACE as u16 => Self::R_BRACE,
323            x if x == Self::L_BRACK as u16 => Self::L_BRACK,
324            x if x == Self::R_BRACK as u16 => Self::R_BRACK,
325            x if x == Self::L_PAREN as u16 => Self::L_PAREN,
326            x if x == Self::R_PAREN as u16 => Self::R_PAREN,
327            x if x == Self::COMMA as u16 => Self::COMMA,
328            x if x == Self::COLON as u16 => Self::COLON,
329            x if x == Self::DOT as u16 => Self::DOT,
330            x if x == Self::AT as u16 => Self::AT,
331            x if x == Self::HASH as u16 => Self::HASH,
332            x if x == Self::AMP as u16 => Self::AMP,
333            x if x == Self::QUESTION as u16 => Self::QUESTION,
334            x if x == Self::EQ as u16 => Self::EQ,
335            x if x == Self::ELLIPSIS as u16 => Self::ELLIPSIS,
336            x if x == Self::EQ_EQ as u16 => Self::EQ_EQ,
337            x if x == Self::BANG_EQ as u16 => Self::BANG_EQ,
338            x if x == Self::LT_EQ as u16 => Self::LT_EQ,
339            x if x == Self::GT_EQ as u16 => Self::GT_EQ,
340            x if x == Self::AMP_AMP as u16 => Self::AMP_AMP,
341            x if x == Self::PIPE_PIPE as u16 => Self::PIPE_PIPE,
342            x if x == Self::PLUS_PLUS as u16 => Self::PLUS_PLUS,
343            x if x == Self::FAT_ARROW as u16 => Self::FAT_ARROW,
344            x if x == Self::THIN_ARROW as u16 => Self::THIN_ARROW,
345            x if x == Self::LT as u16 => Self::LT,
346            x if x == Self::GT as u16 => Self::GT,
347            x if x == Self::PLUS as u16 => Self::PLUS,
348            x if x == Self::MINUS as u16 => Self::MINUS,
349            x if x == Self::STAR as u16 => Self::STAR,
350            x if x == Self::SLASH as u16 => Self::SLASH,
351            x if x == Self::PERCENT as u16 => Self::PERCENT,
352            x if x == Self::BANG as u16 => Self::BANG,
353            x if x == Self::PIPE as u16 => Self::PIPE,
354            x if x == Self::UNDERSCORE as u16 => Self::UNDERSCORE,
355            x if x == Self::UNKNOWN as u16 => Self::UNKNOWN,
356            x if x == Self::F_STRING_OPEN as u16 => Self::F_STRING_OPEN,
357            x if x == Self::F_STRING_CLOSE as u16 => Self::F_STRING_CLOSE,
358            x if x == Self::F_STRING_LITERAL as u16 => Self::F_STRING_LITERAL,
359            x if x == Self::F_STRING_INTERP_START as u16 => Self::F_STRING_INTERP_START,
360            x if x == Self::F_STRING_INTERP_END as u16 => Self::F_STRING_INTERP_END,
361            x if x == Self::DOCUMENT as u16 => Self::DOCUMENT,
362            x if x == Self::DIRECTIVE as u16 => Self::DIRECTIVE,
363            x if x == Self::DECORATOR as u16 => Self::DECORATOR,
364            x if x == Self::DICT as u16 => Self::DICT,
365            x if x == Self::DICT_FIELD as u16 => Self::DICT_FIELD,
366            x if x == Self::LIST as u16 => Self::LIST,
367            x if x == Self::COMPREHENSION as u16 => Self::COMPREHENSION,
368            x if x == Self::CLOSURE as u16 => Self::CLOSURE,
369            x if x == Self::CLOSURE_PARAM as u16 => Self::CLOSURE_PARAM,
370            x if x == Self::CALL_EXPR as u16 => Self::CALL_EXPR,
371            x if x == Self::CALL_ARG as u16 => Self::CALL_ARG,
372            x if x == Self::BINARY_EXPR as u16 => Self::BINARY_EXPR,
373            x if x == Self::UNARY_EXPR as u16 => Self::UNARY_EXPR,
374            x if x == Self::TERNARY_EXPR as u16 => Self::TERNARY_EXPR,
375            x if x == Self::REFERENCE_EXPR as u16 => Self::REFERENCE_EXPR,
376            x if x == Self::VARIABLE_EXPR as u16 => Self::VARIABLE_EXPR,
377            x if x == Self::WHERE_EXPR as u16 => Self::WHERE_EXPR,
378            x if x == Self::MATCH_EXPR as u16 => Self::MATCH_EXPR,
379            x if x == Self::MATCH_ARM as u16 => Self::MATCH_ARM,
380            x if x == Self::MATCH_PATTERN as u16 => Self::MATCH_PATTERN,
381            x if x == Self::VARIANT_CTOR as u16 => Self::VARIANT_CTOR,
382            x if x == Self::F_STRING as u16 => Self::F_STRING,
383            x if x == Self::F_STRING_INTERPOLATION as u16 => Self::F_STRING_INTERPOLATION,
384            x if x == Self::SPREAD_EXPR as u16 => Self::SPREAD_EXPR,
385            x if x == Self::TYPE_NODE as u16 => Self::TYPE_NODE,
386            x if x == Self::WILDCARD as u16 => Self::WILDCARD,
387            x if x == Self::LITERAL as u16 => Self::LITERAL,
388            x if x == Self::ERROR as u16 => Self::ERROR,
389            x if x == Self::TUPLE_TYPE as u16 => Self::TUPLE_TYPE,
390            x if x == Self::SCHEMA_WITH as u16 => Self::SCHEMA_WITH,
391            x if x == Self::SCHEMA_METHOD as u16 => Self::SCHEMA_METHOD,
392            x if x == Self::ENUM_VARIANT as u16 => Self::ENUM_VARIANT,
393            x if x == Self::ENUM_VARIANT_FIELD as u16 => Self::ENUM_VARIANT_FIELD,
394            x if x == Self::TUPLE as u16 => Self::TUPLE,
395            _ => return None,
396        };
397        Some(kind)
398    }
399}
400
401/// Convenience aliases. The vast majority of consumers should reach
402/// for these instead of touching rowan generics directly.
403pub type SyntaxNode = rowan::SyntaxNode<RelonLanguage>;
404pub type SyntaxToken = rowan::SyntaxToken<RelonLanguage>;
405pub type SyntaxElement = rowan::SyntaxElement<RelonLanguage>;
406
#[cfg(test)]
mod tests {
    use super::*;
    use rowan::Language;

    #[test]
    fn trivia_classification() {
        assert!(SyntaxKind::WHITESPACE.is_trivia());
        assert!(SyntaxKind::LINE_COMMENT.is_trivia());
        assert!(SyntaxKind::BLOCK_COMMENT.is_trivia());
        assert!(!SyntaxKind::IDENT.is_trivia());
        assert!(!SyntaxKind::DOCUMENT.is_trivia());
    }

    #[test]
    fn token_vs_node_split() {
        // Every kind before DOCUMENT is a token; everything from
        // DOCUMENT onward is a node — including the kinds that were
        // appended after ERROR.
        assert!(SyntaxKind::WHITESPACE.is_token());
        assert!(SyntaxKind::IDENT.is_token());
        assert!(SyntaxKind::EQ.is_token());
        assert!(SyntaxKind::PIPE.is_token());
        assert!(SyntaxKind::F_STRING_INTERP_END.is_token());
        assert!(SyntaxKind::DOCUMENT.is_node());
        assert!(SyntaxKind::DICT.is_node());
        assert!(SyntaxKind::ERROR.is_node());
        assert!(SyntaxKind::TUPLE_TYPE.is_node());
        assert!(SyntaxKind::TUPLE.is_node());
    }

    #[test]
    fn round_trip_through_rowan_language() {
        // Every real kind occupies one raw value in `0..__LAST`, so
        // walking that range exercises every leaf + node kind and
        // guards against enum-layout drift and against a kind missing
        // from `from_raw` (which would panic in `kind_from_raw`).
        for n in 0..SyntaxKind::__LAST as u16 {
            let raw = rowan::SyntaxKind(n);
            let kind = RelonLanguage::kind_from_raw(raw);
            assert_eq!(kind as u16, n);
            assert_eq!(RelonLanguage::kind_to_raw(kind), raw);
        }
    }

    #[test]
    fn from_raw_rejects_out_of_range() {
        // The sentinel is never a producible kind.
        assert_eq!(SyntaxKind::from_raw(SyntaxKind::__LAST as u16), None);
        assert_eq!(SyntaxKind::from_raw(u16::MAX), None);
    }
}