relon_parser/syntax.rs
//! Concrete syntax tree (CST) foundation built on `rowan`.
//!
//! The v2 parser produces a lossless `SyntaxNode` tree: every byte of
//! input source — including whitespace and comments — is reachable
//! from the root via tokens, and walking the tree back to a string
//! yields the original bytes (verbatim).
//!
//! This module defines:
//! - [`SyntaxKind`] — the unified token + node taxonomy. Every
//!   leaf in a `SyntaxNode` has a leaf `SyntaxKind`; every composite
//!   branch has a node `SyntaxKind`.
//! - [`RelonLanguage`] — the rowan-side phantom that fixes the
//!   `SyntaxNode` / `SyntaxToken` / `SyntaxElement` type aliases to
//!   our `SyntaxKind`.
//!
//! The kinds are organised into ranges so callers can ask "is this a
//! trivia leaf?", "is this a punctuation leaf?", "is this a composite
//! node?" without an exhaustive match.
19
20use std::fmt;
21
/// All token and node kinds the v2 parser produces.
///
/// The discriminant is kept small (`u16`) so rowan's green tree can
/// stash it efficiently. Discriminants are implicit, so a kind's raw
/// value is simply its position in this list: inserting a kind in the
/// middle shifts the value of every kind after it and can move the
/// boundary that the range checks below depend on. Append-only is the
/// rule — new kinds go immediately before [`SyntaxKind::__LAST`]. Leaf
/// kinds are the one exception: they must be declared before
/// [`SyntaxKind::DOCUMENT`], because `is_token` classifies a kind by
/// its position relative to `DOCUMENT`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[repr(u16)]
// Variant names are deliberately SCREAMING_SNAKE_CASE, hence the lint opt-out.
#[allow(non_camel_case_types)]
pub enum SyntaxKind {
    // ----- trivia (covers every byte rowan would otherwise drop) ------
    /// Run of `\t \n\r ` characters between meaningful tokens.
    WHITESPACE,
    /// `// ...` to end of line.
    LINE_COMMENT,
    /// `/* ... */` (may span lines).
    BLOCK_COMMENT,

    // ----- literals + identifiers ------------------------------------
    /// Any `[A-Za-z_][A-Za-z0-9_]*` — keywords are NOT split out at
    /// lex time; the parser checks the text where context matters
    /// (`where`, `match`, `with`, `from`, `as`, etc.).
    IDENT,
    /// Integer / hex / octal / binary / float / scientific. The lexer
    /// captures the whole literal as one token; semantic conversion
    /// to `i64` / `f64` happens later.
    NUMBER,
    /// Any of: plain `"..."`, raw `r"..."` / `r#"..."#`, f-string
    /// `f"..."` / `f#"..."#`. The whole literal — opening quote
    /// through closing quote — is one token at the CST level. The
    /// typed-AST layer breaks f-strings into `FString` parts.
    STRING,

    // ----- single-char punctuation -----------------------------------
    /// `{`
    L_BRACE,
    /// `}`
    R_BRACE,
    /// `[`
    L_BRACK,
    /// `]`
    R_BRACK,
    /// `(`
    L_PAREN,
    /// `)`
    R_PAREN,
    /// `,`
    COMMA,
    /// `:`
    COLON,
    /// `.`
    DOT,
    /// `@` — decorator sigil.
    AT,
    /// `#` — directive sigil.
    HASH,
    /// `&` — reference sigil (`&root.x`).
    AMP,
    /// `?` — optional-type marker or ternary head.
    QUESTION,
    /// `=` — standalone assignment-position equals.
    EQ,

    // ----- multi-char punctuation / operators ------------------------
    /// `...` spread / variadic.
    ELLIPSIS,
    /// `==`
    EQ_EQ,
    /// `!=`
    BANG_EQ,
    /// `<=`
    LT_EQ,
    /// `>=`
    GT_EQ,
    /// `&&`
    AMP_AMP,
    /// `||`
    PIPE_PIPE,
    /// `++`
    PLUS_PLUS,
    /// `=>`
    FAT_ARROW,
    /// `->`
    THIN_ARROW,

    // ----- single-char operators -------------------------------------
    /// `<`
    LT,
    /// `>`
    GT,
    /// `+`
    PLUS,
    /// `-`
    MINUS,
    /// `*` — multiplication, wildcard, or spread depending on context.
    STAR,
    /// `/`
    SLASH,
    /// `%`
    PERCENT,
    /// `!`
    BANG,
    /// `|`
    PIPE,
    /// A bare `_` (an underscore NOT followed by another identifier
    /// char). The Rust-style pattern wildcard for match catch-all arms
    /// (`_: result`) and ignored variant-payload slots. A `_foo` /
    /// `my_var` still lexes as `IDENT`. The schema-field "any-value"
    /// validator keeps its own `*` spelling (`STAR`); the two roles are
    /// deliberately distinct tokens.
    UNDERSCORE,

    /// Any source byte the lexer couldn't classify (stray UTF-8
    /// punctuation, control characters, etc.). Emitted as a single-
    /// codepoint token so the round-trip-by-bytes invariant holds.
    /// Downstream tooling treats this like a syntax error.
    UNKNOWN,

    // ----- f-string sub-tokens ---------------------------------------
    // The lexer emits an entire f-string as one `STRING` leaf so the
    // round-trip-by-bytes invariant holds without any cross-token
    // coordination. The CST builder then refines that single leaf into
    // an `F_STRING` node containing the leaves below + nested
    // `F_STRING_INTERPOLATION` sub-nodes (whose own children are
    // ordinary Relon expressions). The leaves stay BEFORE `DOCUMENT`
    // in the enum order so `is_token` keeps working.
    /// Opening `f"` / `f#"` / `f##"` ... — `#` count varies.
    F_STRING_OPEN,
    /// Closing `"` / `"#` / `"##` matching the open count.
    F_STRING_CLOSE,
    /// Verbatim literal chunk between interpolations / quotes.
    F_STRING_LITERAL,
    /// `${`
    F_STRING_INTERP_START,
    /// Closing `}` of an interpolation.
    F_STRING_INTERP_END,

    // ----- composite-node kinds (populated through P2/P3) ------------
    //
    // Each kind below names a grammar production. Their byte content
    // is reachable through their child tokens / nodes; rowan stitches
    // it all back into source via `SyntaxNode::text`. P2 fills these
    // in; P1 only needs `DOCUMENT` + `ERROR` to round-trip-lex.
    //
    // `DOCUMENT` must remain the FIRST node kind: `is_token` treats
    // every kind declared before it as a leaf.
    //
    /// Whole-file root. Always present. Children:
    /// trivia*, top-level directives*, top-level value, trivia*.
    DOCUMENT,
    /// A `#name <body?>` form.
    DIRECTIVE,
    /// `@name(args?)` form.
    DECORATOR,
    /// `{ ... }` dict / object literal.
    DICT,
    /// One `key: value` (or `key(params): body`) pair inside a DICT.
    DICT_FIELD,
    /// `[ ... ]` list / array literal.
    LIST,
    /// `for x in xs if cond` body inside a LIST.
    COMPREHENSION,
    /// `name(p, q, ...) [-> R]: body` lowered to closure.
    CLOSURE,
    /// Single closure parameter (`name: T` or bare `name`).
    CLOSURE_PARAM,
    /// `name(arg1, arg2 = expr, ...)` call.
    CALL_EXPR,
    /// One arg inside a call's parens — positional or `name = expr`.
    CALL_ARG,
    /// Binary operation node (`a + b`, `a == b`, etc.).
    BINARY_EXPR,
    /// Unary operation node (`!a`, `-a`).
    UNARY_EXPR,
    /// `cond ? then : else`.
    TERNARY_EXPR,
    /// `&base.x.y` reference.
    REFERENCE_EXPR,
    /// `name[.tail]*` bareword path.
    VARIABLE_EXPR,
    /// `expr where { bindings }`.
    WHERE_EXPR,
    /// `expr match { type: arm, ... }`.
    MATCH_EXPR,
    /// One arm inside a MATCH_EXPR.
    MATCH_ARM,
    /// Rust-like enum payload match pattern, e.g. `Pair(a, b)`.
    MATCH_PATTERN,
    /// `EnumName.VariantName { ... }`.
    VARIANT_CTOR,
    /// `f"..."` rendered as a CST node so interpolations are children.
    /// The lexer emits the whole f-string as one `STRING` leaf; the
    /// CST builder breaks it into F_STRING_OPEN, F_STRING_LITERAL
    /// chunks, F_STRING_INTERPOLATION children, and F_STRING_CLOSE.
    F_STRING,
    /// One `${ expr }` zone inside an [`SyntaxKind::F_STRING`]. Children are
    /// `F_STRING_INTERP_START`, then a regular Relon expression node,
    /// then `F_STRING_INTERP_END`.
    F_STRING_INTERPOLATION,
    /// Spread expression `...expr` inside a dict / list.
    SPREAD_EXPR,
    /// A type expression: `Int`, `List<String>`, `User?`, …
    TYPE_NODE,
    /// `*` in wildcard / placeholder position.
    WILDCARD,
    /// Literal `true` / `false` and removed `null` spelling.
    LITERAL,
    /// Unrecoverable parse failure: spans the bytes the parser
    /// couldn't fit into any production. Always has at least one
    /// child token. This is the "first-class hole" that lets
    /// downstream tooling keep working on partial input.
    ERROR,
    /// `(T1, T2, ...)` tuple type — appears in type-hint position
    /// (`(Int, String) pair: ...`) and inside generic argument lists
    /// (`List<(Int, String)>`). The 1-tuple uses a trailing-comma
    /// `(T,)` disambiguator; `()` is the zero-tuple.
    TUPLE_TYPE,
    /// `#schema ... with { ... }` body — a structured method list.
    /// Children: one `SCHEMA_METHOD` per declaration plus any
    /// schema-level pragma directives. The CST keeps every byte
    /// verbatim; the typed-AST layer reads the structure.
    SCHEMA_WITH,
    /// One method declaration inside a [`SyntaxKind::SCHEMA_WITH`] block.
    /// Children: optional pragma directives (`#derive`, `#native`,
    /// `#internal`), an IDENT method name, optional `<T>` generics,
    /// `CLOSURE_PARAM` list, a TYPE_NODE return type, and an
    /// expression body (omitted when `#native` is set).
    SCHEMA_METHOD,
    /// One variant inside a Rust-like `#enum Name { ... }` declaration.
    ENUM_VARIANT,
    /// One named payload field inside a `#enum` variant body.
    ENUM_VARIANT_FIELD,
    /// `(e1, e2, ...)` tuple value literal. Distinct from a
    /// parenthesised group `(e)` (which carries no comma) and from the
    /// `(p, q) => body` closure form. The 1-tuple uses a trailing-comma
    /// `(e,)` disambiguator; `()` is the zero-tuple (unit). Children are
    /// the element expressions in source order.
    TUPLE,

    // Append new kinds above this line.
    /// Sentinel to keep `(SyntaxKind as u16) < (__LAST as u16)`
    /// available for boundary checks. Its value is one past the last
    /// real kind. Never produced.
    __LAST,
}
253
254impl SyntaxKind {
255 /// True for `WHITESPACE` / `LINE_COMMENT` / `BLOCK_COMMENT` —
256 /// tokens that carry no semantic content. Useful for skipping
257 /// when walking the tree for meaningful structure.
258 pub fn is_trivia(self) -> bool {
259 matches!(
260 self,
261 SyntaxKind::WHITESPACE | SyntaxKind::LINE_COMMENT | SyntaxKind::BLOCK_COMMENT
262 )
263 }
264
265 /// True when the kind names a leaf (token) rather than a
266 /// composite branch (node). All kinds before `DOCUMENT` in the
267 /// enum order are leaves; everything from `DOCUMENT` to `ERROR`
268 /// is a node. Keep in sync with the enum layout above.
269 pub fn is_token(self) -> bool {
270 (self as u16) < (SyntaxKind::DOCUMENT as u16)
271 }
272
273 pub fn is_node(self) -> bool {
274 !self.is_token()
275 }
276}
277
278impl fmt::Display for SyntaxKind {
279 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
280 write!(f, "{:?}", self)
281 }
282}
283
284impl From<SyntaxKind> for rowan::SyntaxKind {
285 fn from(kind: SyntaxKind) -> Self {
286 rowan::SyntaxKind(kind as u16)
287 }
288}
289
/// rowan-side phantom that ties [`SyntaxKind`] to rowan's tree
/// generics.
///
/// The enum has no variants, so no value of this type can ever be
/// constructed — it exists only at the type level, as the parameter
/// of the `SyntaxNode` / `SyntaxToken` / `SyntaxElement` aliases.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum RelonLanguage {}
295
296impl rowan::Language for RelonLanguage {
297 type Kind = SyntaxKind;
298
299 fn kind_from_raw(raw: rowan::SyntaxKind) -> Self::Kind {
300 SyntaxKind::from_raw(raw.0).unwrap_or_else(|| panic!("raw kind out of range: {raw:?}"))
301 }
302
303 fn kind_to_raw(kind: Self::Kind) -> rowan::SyntaxKind {
304 kind.into()
305 }
306}
307
308impl SyntaxKind {
309 /// Round-trip back from the raw `u16` rowan stores in its green
310 /// tree. Total over the enum's domain; returns `None` for any
311 /// out-of-range value. The match is exhaustive so the compiler
312 /// catches missing entries when new kinds are appended.
313 pub fn from_raw(raw: u16) -> Option<Self> {
314 let kind = match raw {
315 x if x == Self::WHITESPACE as u16 => Self::WHITESPACE,
316 x if x == Self::LINE_COMMENT as u16 => Self::LINE_COMMENT,
317 x if x == Self::BLOCK_COMMENT as u16 => Self::BLOCK_COMMENT,
318 x if x == Self::IDENT as u16 => Self::IDENT,
319 x if x == Self::NUMBER as u16 => Self::NUMBER,
320 x if x == Self::STRING as u16 => Self::STRING,
321 x if x == Self::L_BRACE as u16 => Self::L_BRACE,
322 x if x == Self::R_BRACE as u16 => Self::R_BRACE,
323 x if x == Self::L_BRACK as u16 => Self::L_BRACK,
324 x if x == Self::R_BRACK as u16 => Self::R_BRACK,
325 x if x == Self::L_PAREN as u16 => Self::L_PAREN,
326 x if x == Self::R_PAREN as u16 => Self::R_PAREN,
327 x if x == Self::COMMA as u16 => Self::COMMA,
328 x if x == Self::COLON as u16 => Self::COLON,
329 x if x == Self::DOT as u16 => Self::DOT,
330 x if x == Self::AT as u16 => Self::AT,
331 x if x == Self::HASH as u16 => Self::HASH,
332 x if x == Self::AMP as u16 => Self::AMP,
333 x if x == Self::QUESTION as u16 => Self::QUESTION,
334 x if x == Self::EQ as u16 => Self::EQ,
335 x if x == Self::ELLIPSIS as u16 => Self::ELLIPSIS,
336 x if x == Self::EQ_EQ as u16 => Self::EQ_EQ,
337 x if x == Self::BANG_EQ as u16 => Self::BANG_EQ,
338 x if x == Self::LT_EQ as u16 => Self::LT_EQ,
339 x if x == Self::GT_EQ as u16 => Self::GT_EQ,
340 x if x == Self::AMP_AMP as u16 => Self::AMP_AMP,
341 x if x == Self::PIPE_PIPE as u16 => Self::PIPE_PIPE,
342 x if x == Self::PLUS_PLUS as u16 => Self::PLUS_PLUS,
343 x if x == Self::FAT_ARROW as u16 => Self::FAT_ARROW,
344 x if x == Self::THIN_ARROW as u16 => Self::THIN_ARROW,
345 x if x == Self::LT as u16 => Self::LT,
346 x if x == Self::GT as u16 => Self::GT,
347 x if x == Self::PLUS as u16 => Self::PLUS,
348 x if x == Self::MINUS as u16 => Self::MINUS,
349 x if x == Self::STAR as u16 => Self::STAR,
350 x if x == Self::SLASH as u16 => Self::SLASH,
351 x if x == Self::PERCENT as u16 => Self::PERCENT,
352 x if x == Self::BANG as u16 => Self::BANG,
353 x if x == Self::PIPE as u16 => Self::PIPE,
354 x if x == Self::UNDERSCORE as u16 => Self::UNDERSCORE,
355 x if x == Self::UNKNOWN as u16 => Self::UNKNOWN,
356 x if x == Self::F_STRING_OPEN as u16 => Self::F_STRING_OPEN,
357 x if x == Self::F_STRING_CLOSE as u16 => Self::F_STRING_CLOSE,
358 x if x == Self::F_STRING_LITERAL as u16 => Self::F_STRING_LITERAL,
359 x if x == Self::F_STRING_INTERP_START as u16 => Self::F_STRING_INTERP_START,
360 x if x == Self::F_STRING_INTERP_END as u16 => Self::F_STRING_INTERP_END,
361 x if x == Self::DOCUMENT as u16 => Self::DOCUMENT,
362 x if x == Self::DIRECTIVE as u16 => Self::DIRECTIVE,
363 x if x == Self::DECORATOR as u16 => Self::DECORATOR,
364 x if x == Self::DICT as u16 => Self::DICT,
365 x if x == Self::DICT_FIELD as u16 => Self::DICT_FIELD,
366 x if x == Self::LIST as u16 => Self::LIST,
367 x if x == Self::COMPREHENSION as u16 => Self::COMPREHENSION,
368 x if x == Self::CLOSURE as u16 => Self::CLOSURE,
369 x if x == Self::CLOSURE_PARAM as u16 => Self::CLOSURE_PARAM,
370 x if x == Self::CALL_EXPR as u16 => Self::CALL_EXPR,
371 x if x == Self::CALL_ARG as u16 => Self::CALL_ARG,
372 x if x == Self::BINARY_EXPR as u16 => Self::BINARY_EXPR,
373 x if x == Self::UNARY_EXPR as u16 => Self::UNARY_EXPR,
374 x if x == Self::TERNARY_EXPR as u16 => Self::TERNARY_EXPR,
375 x if x == Self::REFERENCE_EXPR as u16 => Self::REFERENCE_EXPR,
376 x if x == Self::VARIABLE_EXPR as u16 => Self::VARIABLE_EXPR,
377 x if x == Self::WHERE_EXPR as u16 => Self::WHERE_EXPR,
378 x if x == Self::MATCH_EXPR as u16 => Self::MATCH_EXPR,
379 x if x == Self::MATCH_ARM as u16 => Self::MATCH_ARM,
380 x if x == Self::MATCH_PATTERN as u16 => Self::MATCH_PATTERN,
381 x if x == Self::VARIANT_CTOR as u16 => Self::VARIANT_CTOR,
382 x if x == Self::F_STRING as u16 => Self::F_STRING,
383 x if x == Self::F_STRING_INTERPOLATION as u16 => Self::F_STRING_INTERPOLATION,
384 x if x == Self::SPREAD_EXPR as u16 => Self::SPREAD_EXPR,
385 x if x == Self::TYPE_NODE as u16 => Self::TYPE_NODE,
386 x if x == Self::WILDCARD as u16 => Self::WILDCARD,
387 x if x == Self::LITERAL as u16 => Self::LITERAL,
388 x if x == Self::ERROR as u16 => Self::ERROR,
389 x if x == Self::TUPLE_TYPE as u16 => Self::TUPLE_TYPE,
390 x if x == Self::SCHEMA_WITH as u16 => Self::SCHEMA_WITH,
391 x if x == Self::SCHEMA_METHOD as u16 => Self::SCHEMA_METHOD,
392 x if x == Self::ENUM_VARIANT as u16 => Self::ENUM_VARIANT,
393 x if x == Self::ENUM_VARIANT_FIELD as u16 => Self::ENUM_VARIANT_FIELD,
394 x if x == Self::TUPLE as u16 => Self::TUPLE,
395 _ => return None,
396 };
397 Some(kind)
398 }
399}
400
// Convenience aliases. The vast majority of consumers should reach
// for these instead of touching rowan generics directly.

/// rowan's syntax-tree node type, specialised to [`RelonLanguage`]
/// so that its kinds are [`SyntaxKind`] values.
pub type SyntaxNode = rowan::SyntaxNode<RelonLanguage>;
/// rowan's syntax-tree token type, specialised to [`RelonLanguage`].
pub type SyntaxToken = rowan::SyntaxToken<RelonLanguage>;
/// rowan's syntax-tree element type, specialised to [`RelonLanguage`].
pub type SyntaxElement = rowan::SyntaxElement<RelonLanguage>;
406
#[cfg(test)]
mod tests {
    use super::*;
    use rowan::Language;

    /// Only the whitespace and comment kinds count as trivia.
    #[test]
    fn trivia_classification() {
        assert!(SyntaxKind::WHITESPACE.is_trivia());
        assert!(SyntaxKind::LINE_COMMENT.is_trivia());
        assert!(SyntaxKind::BLOCK_COMMENT.is_trivia());
        assert!(!SyntaxKind::IDENT.is_trivia());
        assert!(!SyntaxKind::UNKNOWN.is_trivia());
        assert!(!SyntaxKind::DOCUMENT.is_trivia());
    }

    /// Every kind before DOCUMENT is a token; DOCUMENT and every kind
    /// declared after it (up to the `__LAST` sentinel) is a node.
    #[test]
    fn token_vs_node_split() {
        assert!(SyntaxKind::WHITESPACE.is_token());
        assert!(SyntaxKind::IDENT.is_token());
        assert!(SyntaxKind::EQ.is_token());
        assert!(SyntaxKind::PIPE.is_token());
        assert!(SyntaxKind::UNDERSCORE.is_token());
        assert!(SyntaxKind::UNKNOWN.is_token());
        // The f-string leaves are the last tokens before the boundary.
        assert!(SyntaxKind::F_STRING_OPEN.is_token());
        assert!(SyntaxKind::F_STRING_INTERP_END.is_token());

        assert!(SyntaxKind::DOCUMENT.is_node());
        assert!(SyntaxKind::DICT.is_node());
        assert!(SyntaxKind::ERROR.is_node());
        // Kinds appended after ERROR are nodes too.
        assert!(SyntaxKind::TUPLE_TYPE.is_node());
        assert!(SyntaxKind::TUPLE.is_node());
    }

    /// Every leaf + node kind round-trips through
    /// `kind_to_raw` ∘ `kind_from_raw`. Walking the whole raw range
    /// (rather than a hand-picked sample) guards against enum-layout
    /// drift and against a kind that was appended to the enum but
    /// forgotten in `from_raw`.
    #[test]
    fn round_trip_through_rowan_language() {
        for raw in 0..SyntaxKind::__LAST as u16 {
            let kind = SyntaxKind::from_raw(raw)
                .unwrap_or_else(|| panic!("from_raw has no entry for discriminant {raw}"));
            assert_eq!(kind as u16, raw);
            assert_eq!(RelonLanguage::kind_to_raw(kind), rowan::SyntaxKind(raw));
            assert_eq!(RelonLanguage::kind_from_raw(rowan::SyntaxKind(raw)), kind);
            // A kind is always exactly one of token / node.
            assert_ne!(kind.is_token(), kind.is_node());
        }
    }

    /// The sentinel and anything beyond it are not real kinds.
    #[test]
    fn from_raw_rejects_out_of_range_values() {
        assert_eq!(SyntaxKind::from_raw(SyntaxKind::__LAST as u16), None);
        assert_eq!(SyntaxKind::from_raw(u16::MAX), None);
    }
}