cyrs_syntax/parser.rs
1//! Event-based recovering parser (spec 0001 §4.2, §4.3, §4.4, §4.6).
2//!
3//! # Architecture
4//!
5//! The parser is a hand-written recursive-descent engine that emits an
6//! `Event` stream rather than directly mutating a tree. A separate
7//! `SyntaxTreeBuilder` — living in the internal `builder` module —
8//! consumes the events and produces the [`rowan`] green tree. This split
9//! lets the parser:
10//!
11//! - rewrite events for associativity fixing in Pratt expression parsing
12//! (the `Start` event can be retroactively relocated via the
13//! `Marker::precede` mechanism);
14//! - group or suppress spurious errors before they reach diagnostics;
15//! - interleave trivia tokens (whitespace, comments) between significant
16//! tokens so the constructed tree is byte-lossless per §4.4.
17//!
18//! # Error recovery
19//!
20//! Per spec §4.3 the recovering parser exposes two primitives the grammar
21//! builds on:
22//!
23//! - **Synchronisation-set skip** (`Parser::recover_until`): when a
24//! clause fails, tokens are bumped into an `ERROR` node until we see a
25//! clause-level keyword, a `;`, or EOF.
26//! - **Virtual-token insertion** (`Parser::expect`): when a required
27//! closer/keyword is missing, an `ERROR` node of zero width is recorded
28//! at the expected position; the parser continues.
29//!
30//! Both primitives are shared across productions; per-production recovery
31//! tables (spec §4.3) land in a later bead (`cy-2vh`).
32//!
33//! # Losslessness invariant
34//!
35//! Spec §4.4: `parse(s).syntax().to_string() == s` for every input. Trivia
36//! and `ERROR` tokens are preserved in the tree; the property test
37//! `lossless_on_arbitrary_utf8` enforces the invariant.
38
39use drop_bomb::DropBomb;
40use rowan::GreenNode;
41use thiserror::Error;
42
43use crate::{
44 SyntaxKind, SyntaxNode,
45 builder::SyntaxTreeBuilder,
46 lexer::{LexToken, lex},
47};
48
49/// A completed parse: green tree + accumulated syntax errors.
50#[derive(Debug, Clone)]
51pub struct Parse {
52 green: GreenNode,
53 errors: Vec<SyntaxError>,
54}
55
56impl Parse {
57 /// Root `SyntaxNode` view of the parsed tree. Cheap to clone —
58 /// the underlying `GreenNode` is ref-counted.
59 #[must_use]
60 pub fn syntax(&self) -> SyntaxNode {
61 SyntaxNode::new_root(self.green.clone())
62 }
63
64 /// Raw rowan green tree backing the parse. Usually
65 /// `syntax()` is what you want; `green()` is exposed for
66 /// callers that want to share the tree by reference.
67 #[must_use]
68 pub fn green(&self) -> &GreenNode {
69 &self.green
70 }
71
72 /// Accumulated syntax errors in source order. Empty when the
73 /// parse was fully clean.
74 #[must_use]
75 pub fn errors(&self) -> &[SyntaxError] {
76 &self.errors
77 }
78
79 /// Construct a `Parse` from raw parts. Crate-private — used by the
80 /// incremental smart-path splicer (cy-li5) which builds a new green
81 /// tree via `replace_with` and supplies a re-derived error list.
82 #[cfg(feature = "incremental")]
83 pub(crate) fn from_parts(green: GreenNode, errors: Vec<SyntaxError>) -> Self {
84 Parse { green, errors }
85 }
86}
87
88/// Entry point. Parses a complete source file.
89///
90/// Invariants:
91/// - `parse(src).syntax().to_string() == src` for every input (spec §4.4).
92/// - Never panics on any UTF-8 input (spec §17.3 P17.3.1).
93#[must_use]
94pub fn parse(src: &str) -> Parse {
95 let tokens = lex(src);
96 let mut parser = Parser::new(&tokens);
97 crate::grammar::source_file(&mut parser);
98 let events = parser.into_events();
99 let (green, errors) = SyntaxTreeBuilder::build(&tokens, events);
100 Parse { green, errors }
101}
102
103/// A single syntax error record. Spec §4.3 + §10.
104///
105/// `cyrs-syntax` does not depend on `cyrs-diag` (§3.1), so this is a
106/// local stand-in carrying just the minimum a later pass needs to lift it
107/// into a full `Diagnostic`: a stable numeric code (matching the
108/// `DiagCode` discriminants in `cyrs-diag`), a human-readable message,
109/// and the byte offset at which it was emitted.
110///
111/// Embedders should not match on the raw `u16` `code` field — names
112/// survive renumbering, magic numbers do not (cy-emb3). When
113/// `cyrs-diag` is on the dependency graph (typically via `cyrs-db`),
114/// lift the numeric to the typed enum:
115///
116/// ```ignore
117/// use cyrs_diag::DiagCode;
118/// match DiagCode::from(err) {
119/// DiagCode::E0007 => { /* expected statement */ }
120/// _ => { /* other syntax codes */ }
121/// }
122/// ```
123#[derive(Debug, Clone, Error)]
124#[error("[E{code:04}] {message}")]
125pub struct SyntaxError {
126 /// Numeric value of the `DiagCode` discriminant (e.g. 3 for E0003).
127 pub code: u16,
128 /// Human-readable message (rustc-style lower-case initial, no
129 /// trailing period).
130 pub message: String,
131 /// Byte offset at which the error was emitted. Typically the
132 /// position of the offending token.
133 pub offset: text_size::TextSize,
134}
135
136// ========================================================================
137// Event stream
138// ========================================================================
139
140/// Events are emitted by the parser and consumed by
141/// [`SyntaxTreeBuilder`](crate::builder::SyntaxTreeBuilder).
142///
143/// `Start` carries an optional forward-parent index implementing
144/// [`Marker::precede`] (re-parenting a started node inside a new outer
145/// node); [`Event::Abandoned`] retracts a speculatively-started node.
146///
147/// The event stream is flat: Start/Finish pairs define nesting, Token
148/// events are emitted in source order alongside trivia that the builder
149/// re-interleaves from the underlying token slice.
150#[derive(Debug)]
151pub(crate) enum Event {
152 Start {
153 kind: SyntaxKind,
154 forward_parent: Option<u32>,
155 },
156 Finish,
157 Token {
158 kind: SyntaxKind,
159 },
160 Error {
161 code: u16,
162 msg: String,
163 offset: text_size::TextSize,
164 },
165 Abandoned,
166}
167
168impl Event {
169 /// Placeholder used by the builder when it swap-removes an event
170 /// during forward-parent chain resolution.
171 pub(crate) fn tombstone() -> Self {
172 Event::Abandoned
173 }
174}
175
176// ========================================================================
177// Parser engine
178// ========================================================================
179
180/// Token-set bitmap used by `at_ts`/`recover_until`. Backed by an
181/// 8×u128 array so we can represent every [`SyntaxKind`] value in
182/// `0..1024` without collisions (the enum's high-water mark is
183/// [`SyntaxKind::EOF`] = 769). Every construction is `const`-friendly so
184/// grammar code can build sets at compile time.
185#[derive(Copy, Clone, Debug)]
186pub(crate) struct TokenSet([u128; TOKEN_SET_WORDS]);
187
188const TOKEN_SET_WORDS: usize = 8;
189#[allow(clippy::cast_possible_truncation)]
190const TOKEN_SET_CAPACITY: u16 = (TOKEN_SET_WORDS as u16) * 128;
191
192impl TokenSet {
193 pub(crate) const EMPTY: TokenSet = TokenSet([0; TOKEN_SET_WORDS]);
194
195 pub(crate) const fn new(kinds: &[SyntaxKind]) -> Self {
196 let mut mask = [0u128; TOKEN_SET_WORDS];
197 let mut i = 0;
198 while i < kinds.len() {
199 let raw = kinds[i] as u16;
200 assert!(raw < TOKEN_SET_CAPACITY, "TokenSet: kind out of range");
201 let word = (raw / 128) as usize;
202 let bit = raw % 128;
203 mask[word] |= 1u128 << bit;
204 i += 1;
205 }
206 TokenSet(mask)
207 }
208
209 pub(crate) const fn union(self, other: TokenSet) -> TokenSet {
210 let mut out = [0u128; TOKEN_SET_WORDS];
211 let mut i = 0;
212 while i < TOKEN_SET_WORDS {
213 out[i] = self.0[i] | other.0[i];
214 i += 1;
215 }
216 TokenSet(out)
217 }
218
219 pub(crate) const fn contains(self, kind: SyntaxKind) -> bool {
220 let raw = kind as u16;
221 if raw >= TOKEN_SET_CAPACITY {
222 return false;
223 }
224 let word = (raw / 128) as usize;
225 let bit = raw % 128;
226 (self.0[word] & (1u128 << bit)) != 0
227 }
228}
229
230impl Default for TokenSet {
231 fn default() -> Self {
232 TokenSet::EMPTY
233 }
234}
235
236/// The recovering parser. Owns the token slice plus an in-flight event
237/// buffer. A non-trivia cursor jumps over whitespace/comment tokens so
238/// grammar code never sees them — the builder reassembles trivia when
239/// consuming events.
240pub(crate) struct Parser<'a> {
241 tokens: &'a [LexToken],
242 /// Position in `tokens` — the index of the next *significant* token.
243 /// Trivia between significant tokens is implicitly skipped.
244 pos: usize,
245 events: Vec<Event>,
246}
247
248impl<'a> Parser<'a> {
249 pub(crate) fn new(tokens: &'a [LexToken]) -> Self {
250 Self {
251 tokens,
252 pos: skip_trivia(tokens, 0),
253 events: Vec::new(),
254 }
255 }
256
257 /// Current significant token kind, or `EOF` if exhausted.
258 pub(crate) fn current(&self) -> SyntaxKind {
259 self.tokens
260 .get(self.pos)
261 .map_or(SyntaxKind::EOF, |t| t.kind)
262 }
263
264 /// Look ahead `n` significant tokens. `nth(0) == current()`.
265 #[allow(dead_code)]
266 pub(crate) fn nth(&self, n: usize) -> SyntaxKind {
267 let mut idx = self.pos;
268 let mut remaining = n;
269 loop {
270 if idx >= self.tokens.len() {
271 return SyntaxKind::EOF;
272 }
273 if self.tokens[idx].kind.is_trivia() {
274 idx += 1;
275 continue;
276 }
277 if remaining == 0 {
278 return self.tokens[idx].kind;
279 }
280 remaining -= 1;
281 idx += 1;
282 }
283 }
284
285 pub(crate) fn at(&self, kind: SyntaxKind) -> bool {
286 self.current() == kind
287 }
288
289 pub(crate) fn at_ts(&self, set: TokenSet) -> bool {
290 set.contains(self.current())
291 }
292
293 /// Case-insensitive match against a contextual keyword spelled as an
294 /// identifier. Cypher allows unreserved identifiers (e.g. `WITH` when
295 /// scoped by context); for v1 we only need this for the composite
296 /// operators `STARTS WITH` / `ENDS WITH`, which are handled via the
297 /// already-tokenised `STARTS_KW` / `ENDS_KW` and thus do not need the
298 /// hook. Left here as a single call site in case later beads do.
299 #[allow(dead_code)]
300 pub(crate) fn at_contextual(&self, ident: &str) -> bool {
301 self.current() == SyntaxKind::IDENT
302 && self
303 .tokens
304 .get(self.pos)
305 .is_some_and(|t| t.text.eq_ignore_ascii_case(ident))
306 }
307
308 /// Byte offset of the current token (or end-of-input if exhausted).
309 fn current_offset(&self) -> text_size::TextSize {
310 self.tokens.get(self.pos).map_or_else(
311 || {
312 self.tokens
313 .last()
314 .map_or(text_size::TextSize::from(0), |t| t.range.end())
315 },
316 |t| t.range.start(),
317 )
318 }
319
320 /// Consume whatever token is current (including EOF-no-op) and emit it
321 /// as a Token event. Use sparingly — prefer `bump(kind)` when the kind
322 /// is known for the debug assertion.
323 pub(crate) fn bump_any(&mut self) {
324 let kind = self.current();
325 if kind == SyntaxKind::EOF {
326 return;
327 }
328 self.events.push(Event::Token { kind });
329 self.pos += 1;
330 self.pos = skip_trivia(self.tokens, self.pos);
331 }
332
333 /// Consume the current token, asserting it has the given kind in debug.
334 pub(crate) fn bump(&mut self, kind: SyntaxKind) {
335 debug_assert!(
336 self.at(kind),
337 "bump: expected {kind:?}, got {:?}",
338 self.current()
339 );
340 self.bump_any();
341 }
342
343 /// Consume `kind` if present; return whether it was.
344 pub(crate) fn eat(&mut self, kind: SyntaxKind) -> bool {
345 if self.at(kind) {
346 self.bump(kind);
347 true
348 } else {
349 false
350 }
351 }
352
353 /// Record a diagnostic at the current position without consuming.
354 ///
355 /// `code` is the numeric discriminant of the corresponding `DiagCode`
356 /// variant in `cyrs-diag` (e.g. `3` for `E0003`). Use the
357 /// `syntax_codes` constants in this module for clarity.
358 pub(crate) fn error_code(&mut self, code: u16, msg: impl Into<String>) {
359 let offset = self.current_offset();
360 self.events.push(Event::Error {
361 code,
362 msg: msg.into(),
363 offset,
364 });
365 }
366
367 /// Record a generic syntax error (E0001) at the current position.
368 ///
369 /// Prefer [`Parser::error_code`] with a specific code at every call
370 /// site that maps to a known error class.
371 pub(crate) fn error(&mut self, msg: impl Into<String>) {
372 self.error_code(syntax_codes::GENERIC_SYNTAX_ERROR, msg);
373 }
374
375 /// Consume one token and wrap it in an ERROR node alongside a message.
376 #[allow(dead_code)]
377 pub(crate) fn err_and_bump(&mut self, msg: impl Into<String>) {
378 let m = self.start();
379 self.error(msg);
380 self.bump_any();
381 m.complete(self, SyntaxKind::ERROR);
382 }
383
384 /// Require `kind`. If present, consume it; otherwise emit a diagnostic
385 /// at the expected position. The parser continues — the missing
386 /// closer acts as a zero-width `ERROR` in the event stream
387 /// (virtual-token insertion per spec §4.3).
388 #[allow(dead_code)]
389 pub(crate) fn expect(&mut self, kind: SyntaxKind) -> bool {
390 if self.eat(kind) {
391 true
392 } else {
393 self.error_code(syntax_codes::EXPECTED_TOKEN, format!("expected {kind:?}"));
394 false
395 }
396 }
397
398 /// Skip tokens until we hit one in `set`, an already-synchronising
399 /// clause keyword, a `;`, or EOF. Skipped tokens are wrapped in a
400 /// single `ERROR` node (spec §4.3 — synchronisation-set skip).
401 pub(crate) fn recover_until(&mut self, set: TokenSet) {
402 let stop = set.union(RECOVERY_STOP);
403 if stop.contains(self.current()) || self.current() == SyntaxKind::EOF {
404 return;
405 }
406 let m = self.start();
407 while !stop.contains(self.current()) && self.current() != SyntaxKind::EOF {
408 self.bump_any();
409 }
410 m.complete(self, SyntaxKind::ERROR);
411 }
412
413 /// Begin a new node. Returns a `Marker` that must be completed or
414 /// abandoned before it is dropped.
415 pub(crate) fn start(&mut self) -> Marker {
416 let pos = self.events.len();
417 self.events.push(Event::Start {
418 kind: SyntaxKind::ERROR, // placeholder; replaced by complete()
419 forward_parent: None,
420 });
421 Marker::new(pos)
422 }
423
424 /// Return the current token-cursor position. Used by the no-progress
425 /// guard in `source_file` to detect loops where no token is consumed.
426 pub(crate) fn position(&self) -> usize {
427 self.pos
428 }
429
430 pub(crate) fn into_events(self) -> Vec<Event> {
431 self.events
432 }
433}
434
435// ========================================================================
436// Syntax error code constants (spec §10.2, E0001–E0999)
437// ========================================================================
438//
439// These numeric values MUST match the discriminants of the corresponding
440// `DiagCode` enum variants in `cyrs-diag/src/codes.rs`. `cyrs-syntax`
441// cannot depend on `cyrs-diag` (§3.1), so the codes are mirrored here
442// as plain `u16` constants. CI enforces that each constant is actually
443// used at a call site and that its value exists in the registry.
444
445pub(crate) mod syntax_codes {
446 /// E0001 — generic / unclassified syntax error.
447 pub(crate) const GENERIC_SYNTAX_ERROR: u16 = 1;
448 /// E0002 — unexpected token.
449 pub(crate) const UNEXPECTED_TOKEN: u16 = 2;
450 /// E0003 — expected `<token>`, found `<token>`.
451 pub(crate) const EXPECTED_TOKEN: u16 = 3;
452 /// E0004 — unclosed string literal.
453 pub(crate) const UNCLOSED_STRING: u16 = 4;
454 /// E0005 — unclosed block comment.
455 pub(crate) const UNCLOSED_BLOCK_COMMENT: u16 = 5;
456 /// E0006 — invalid numeric literal.
457 pub(crate) const INVALID_NUMERIC_LITERAL: u16 = 6;
458 /// E0007 — expected statement.
459 pub(crate) const EXPECTED_STATEMENT: u16 = 7;
460 /// E0008 — expected `;` or end of input.
461 pub(crate) const EXPECTED_SEMICOLON_OR_EOF: u16 = 8;
462 /// E0009 — expected `(` to start a node pattern.
463 pub(crate) const EXPECTED_LPAREN_NODE: u16 = 9;
464 /// E0010 — expected node pattern after relationship.
465 pub(crate) const EXPECTED_NODE_AFTER_REL: u16 = 10;
466 /// E0011 — expected `)` to close node pattern.
467 pub(crate) const EXPECTED_RPAREN_NODE: u16 = 11;
468 /// E0012 — expected `-` at relationship start.
469 pub(crate) const EXPECTED_DASH_REL_START: u16 = 12;
470 /// E0013 — expected `-` to close relationship.
471 pub(crate) const EXPECTED_DASH_REL_CLOSE: u16 = 13;
472 /// E0014 — expected `-` or `->` to close relationship.
473 pub(crate) const EXPECTED_DASH_OR_ARROW: u16 = 14;
474 /// E0015 — expected `]` to close relationship detail.
475 pub(crate) const EXPECTED_RBRACK_REL: u16 = 15;
476 /// E0016 — expected label after `:`.
477 pub(crate) const EXPECTED_LABEL: u16 = 16;
478 /// E0017 — expected relationship type after `:`.
479 pub(crate) const EXPECTED_REL_TYPE: u16 = 17;
480 /// E0018 — expected `}` to close property map.
481 pub(crate) const EXPECTED_RBRACE_PROP: u16 = 18;
482 /// E0019 — expected property key.
483 pub(crate) const EXPECTED_PROP_KEY: u16 = 19;
484 /// E0020 — expected `:` in property entry.
485 pub(crate) const EXPECTED_COLON_PROP: u16 = 20;
486 /// E0021 — expected expression for property value.
487 pub(crate) const EXPECTED_PROP_VALUE: u16 = 21;
488 /// E0022 — expected identifier.
489 pub(crate) const EXPECTED_IDENT: u16 = 22;
490 /// E0023 — expression nesting limit exceeded.
491 pub(crate) const EXPR_NESTING_LIMIT: u16 = 23;
492 /// E0024 — expected operand after unary operator.
493 pub(crate) const EXPECTED_UNARY_OPERAND: u16 = 24;
494 /// E0025 — expected `NULL` after `IS`.
495 pub(crate) const EXPECTED_NULL_AFTER_IS: u16 = 25;
496 /// E0026 — expected right-hand side of binary expression.
497 pub(crate) const EXPECTED_BINOP_RHS: u16 = 26;
498 /// E0027 — expected expression inside parentheses.
499 pub(crate) const EXPECTED_EXPR_IN_PARENS: u16 = 27;
500 /// E0028 — expected `)` to close expression.
501 pub(crate) const EXPECTED_RPAREN_EXPR: u16 = 28;
502 /// E0029 — expected `WITH` after `STARTS`.
503 pub(crate) const EXPECTED_WITH_AFTER_STARTS: u16 = 29;
504 /// E0030 — expected `WITH` after `ENDS`.
505 pub(crate) const EXPECTED_WITH_AFTER_ENDS: u16 = 30;
506 /// E0031 — expected property key after `.`.
507 pub(crate) const EXPECTED_PROP_KEY_AFTER_DOT: u16 = 31;
508 /// E0032 — expected index expression.
509 pub(crate) const EXPECTED_INDEX_EXPR: u16 = 32;
510 /// E0033 — expected `]` to close index expression.
511 pub(crate) const EXPECTED_RBRACK_INDEX: u16 = 33;
512 /// E0034 — expected `)` to close function call.
513 pub(crate) const EXPECTED_RPAREN_CALL: u16 = 34;
514 /// E0035 — expected function argument.
515 pub(crate) const EXPECTED_CALL_ARG: u16 = 35;
516 /// E0036 — expected expression in `RETURN` item.
517 pub(crate) const EXPECTED_RETURN_EXPR: u16 = 36;
518 /// E0037 — expected identifier after `AS`.
519 pub(crate) const EXPECTED_IDENT_AFTER_AS: u16 = 37;
520 /// E0038 — expected `BY` after `ORDER`.
521 pub(crate) const EXPECTED_BY_AFTER_ORDER: u16 = 38;
522 /// E0039 — expected expression in `ORDER BY`.
523 pub(crate) const EXPECTED_ORDERBY_EXPR: u16 = 39;
524 /// E0040 — expected expression after `SKIP`.
525 pub(crate) const EXPECTED_SKIP_EXPR: u16 = 40;
526 /// E0041 — expected expression after `LIMIT`.
527 pub(crate) const EXPECTED_LIMIT_EXPR: u16 = 41;
528 /// E0042 — expected `MATCH` after `OPTIONAL`.
529 pub(crate) const EXPECTED_MATCH_AFTER_OPTIONAL: u16 = 42;
530 /// E0043 — expected expression after `WHERE`.
531 pub(crate) const EXPECTED_WHERE_EXPR: u16 = 43;
532 /// E0044 — clause not yet implemented (deferred construct).
533 /// E0044 — was the cy-nom-era "unimplemented clause" stub error
534 /// emitted by `deferred_clause_stub`. cy-4mg landed CALL — the last
535 /// previously-deferred clause — so the helper is gone. Code number
536 /// retained to preserve the diagnostic registry's monotonic numbering.
537 #[allow(dead_code)]
538 pub(crate) const UNIMPLEMENTED_CLAUSE_RESERVED: u16 = 44;
539 /// E0045 — expected a clause keyword.
540 pub(crate) const EXPECTED_CLAUSE: u16 = 45;
541 /// E0046 — invalid escape sequence in string literal.
542 pub(crate) const INVALID_ESCAPE: u16 = 46;
543 /// E0047 — expected expression in list literal.
544 pub(crate) const EXPECTED_LIST_ELEM: u16 = 47;
545 /// E0048 — expected `]` to close list literal.
546 pub(crate) const EXPECTED_RBRACK_LIST: u16 = 48;
547 /// E0049 — expected key in map literal.
548 pub(crate) const EXPECTED_MAP_KEY: u16 = 49;
549 /// E0050 — expected `:` in map entry.
550 pub(crate) const EXPECTED_COLON_MAP: u16 = 50;
551 /// E0051 — expected expression for map value.
552 pub(crate) const EXPECTED_MAP_VALUE: u16 = 51;
553 /// E0052 — expected `}` to close map literal.
554 pub(crate) const EXPECTED_RBRACE_MAP: u16 = 52;
555 /// E0053 — expected expression after `UNWIND`.
556 pub(crate) const EXPECTED_UNWIND_EXPR: u16 = 53;
557 /// E0054 — expected `AS` after `UNWIND` expression.
558 pub(crate) const EXPECTED_AS_UNWIND: u16 = 54;
559 /// E0055 — expected pattern after `CREATE`.
560 pub(crate) const EXPECTED_CREATE_PATTERN: u16 = 55;
561 /// E0056 — expected pattern after `MERGE`.
562 pub(crate) const EXPECTED_MERGE_PATTERN: u16 = 56;
563 /// E0057 — expected SET item (property assignment or label add).
564 pub(crate) const EXPECTED_SET_ITEM: u16 = 57;
565 /// E0058 — expected REMOVE item (property or label).
566 pub(crate) const EXPECTED_REMOVE_ITEM: u16 = 58;
567 /// E0059 — expected expression after `DELETE`.
568 pub(crate) const EXPECTED_DELETE_EXPR: u16 = 59;
569 /// E0060 — expected `DELETE` after `DETACH`.
570 pub(crate) const EXPECTED_DELETE_AFTER_DETACH: u16 = 60;
571 /// E0061 — expected `CREATE` or `MATCH` after `ON` in MERGE action.
572 pub(crate) const EXPECTED_ON_ACTION: u16 = 61;
573 /// E0062 — expected `]` to close variable-length hop.
574 #[allow(dead_code)]
575 pub(crate) const EXPECTED_RBRACK_HOP: u16 = 62;
576 /// E0063 — expected `=` after path binder identifier.
577 #[allow(dead_code)]
578 pub(crate) const EXPECTED_EQ_PATH_BIND: u16 = 63;
579 /// E0064 — expected `]` to close an indexing / slicing bracket.
580 ///
581 /// Distinct from `EXPECTED_RBRACK_INDEX` (E0033) which covers the
582 /// legacy `SUBSCRIPT_EXPR` path: E0064 is emitted by the typed
583 /// index / slice recovery introduced in cy-7s6.1.
584 pub(crate) const UNCLOSED_INDEX_BRACKET: u16 = 64;
585 /// E0065 — expected `(` after ANY / ALL / NONE / SINGLE keyword to
586 /// begin the list-predicate arg list (cy-8x5).
587 pub(crate) const EXPECTED_LPAREN_LIST_PREDICATE: u16 = 65;
588 /// E0066 — expected `IN` between the list-predicate binder and its
589 /// source expression (cy-8x5).
590 pub(crate) const EXPECTED_IN_LIST_PREDICATE: u16 = 66;
591 /// E0067 — expected `)` to close a list-predicate expression
592 /// (cy-8x5).
593 pub(crate) const EXPECTED_RPAREN_LIST_PREDICATE: u16 = 67;
594 /// E0068 — expected `IN` in list comprehension.
595 ///
596 /// Emitted by the list-comprehension parse path when the first
597 /// token inside `[` is an identifier but is not followed by the
598 /// `IN` keyword — cy-5gh.
599 pub(crate) const EXPECTED_IN_LIST_COMP: u16 = 68;
600 /// E0069 — expected `|` or `]` in list comprehension.
601 ///
602 /// Emitted when a list comprehension's trailing segment is
603 /// malformed — either a missing `|` before the projection
604 /// expression or a missing `]` closer — cy-5gh.
605 pub(crate) const EXPECTED_PIPE_OR_RBRACK_LIST_COMP: u16 = 69;
606 /// E0070 — expected `THEN` in CASE arm.
607 ///
608 /// Emitted by the parser's `CASE` production when a `WHEN <value>`
609 /// clause is not followed by the mandatory `THEN` keyword — cy-41u.
610 pub(crate) const EXPECTED_THEN_CASE: u16 = 70;
611 /// E0071 — expected `END` to close a CASE expression.
612 ///
613 /// Emitted by the parser's `CASE` production when the terminating
614 /// `END` keyword is missing after the final arm / optional `ELSE`
615 /// branch — cy-41u.
616 pub(crate) const EXPECTED_END_CASE: u16 = 71;
617 /// E0072 — expected `)` to close an `EXISTS(<pattern>)` pattern
618 /// predicate (cy-lve, spec §6.1 / §19 row "Pattern predicates in
619 /// expressions").
620 pub(crate) const EXPECTED_RPAREN_EXISTS: u16 = 72;
621 /// E0073 — expected procedure name after `CALL` (cy-4mg, spec §14 /
622 /// §19 row "CALL <proc> YIELD ..."). The procedure-name production
623 /// is `IDENT ('.' IDENT)*`; emitted when the token after `CALL` is
624 /// not an identifier.
625 pub(crate) const EXPECTED_PROCEDURE_NAME: u16 = 73;
626 /// E0074 — expected `)` to close the argument list of a CALL
627 /// invocation (cy-4mg, spec §14).
628 pub(crate) const EXPECTED_RPAREN_CALL_ARGS: u16 = 74;
629 /// E0075 — expected identifier in a YIELD item (cy-4mg, spec §14 /
630 /// ungrammar `YieldItem = NameRef ('AS' NameDef)?`).
631 pub(crate) const EXPECTED_YIELD_ITEM: u16 = 75;
632 /// E0077 — expected `)` to close a `shortestPath` /
633 /// `allShortestPaths` pattern function argument (cy-b5b, spec §6.4 /
634 /// §19 row "shortest-path"). E0076 reserved for `CALL { ... }`
635 /// block subquery (cy-4mg follow-up — block form deferred per §20 D1).
636 pub(crate) const EXPECTED_RPAREN_SHORTEST_PATH: u16 = 77;
637 /// E0078 — expected `}` to close a map projection (cy-01q, spec §6.1 /
638 /// §19 row "Map projection"). Distinct from E0052 (map literal close)
639 /// so tooling can tell projection from literal recovery apart.
640 /// Renumbered from E0073 to avoid collision with cy-4mg.
641 pub(crate) const EXPECTED_RBRACE_MAP_PROJECTION: u16 = 78;
642 /// E0079 — expected property name or `*` after `.` in a map projection
643 /// item (cy-01q). The `.` opens either `.NAME` (property selector) or
644 /// `.*` (all-properties spread); anything else is an error.
645 /// Renumbered from E0074.
646 pub(crate) const EXPECTED_PROP_OR_STAR_AFTER_DOT_IN_PROJECTION: u16 = 79;
647 /// E0080 — expected `:` in a map projection literal item (cy-01q).
648 /// Distinct from E0050 (map literal `:`) for the same reason as
649 /// E0078 vs E0052. Renumbered from E0075.
650 pub(crate) const EXPECTED_COLON_MAP_PROJECTION: u16 = 80;
651 /// E0081 — expected expression for map projection value (cy-01q).
652 /// Renumbered from E0076.
653 pub(crate) const EXPECTED_VALUE_MAP_PROJECTION: u16 = 81;
654 /// E0082 — expected an item in map projection: `.name`, `key: expr`,
655 /// `.*`, or `*` (cy-01q). Renumbered from E0077.
656 pub(crate) const EXPECTED_MAP_PROJECTION_ITEM: u16 = 82;
657
658 // ---- dialect gates (E4xxx, shared with cyrs-diag::codes) -----------
659 // The `error_code` payload is the numeric part of a `DiagCode`
660 // variant. Dialect-gate codes live in `crates/cyrs-diag/src/codes.rs`
661 // and are referenced here as `u16` constants so the parser can emit
662 // them without a cross-crate dependency.
663
664 /// E4017 — `EXISTS { <subquery> }` block form is deferred per spec
665 /// §9.3 / §19 / §20 D1 (cy-lve, tranche A). Registered in
666 /// `cyrs-diag::codes` (`DiagCode::E4017`, `exists_subquery`
667 /// dialect gate). Emitted by the parser when it encounters
668 /// `EXISTS {` in an atom position.
669 pub(crate) const EXISTS_BLOCK_DEFERRED: u16 = 4017;
670}
671
672/// Advance `idx` past any leading trivia tokens.
673fn skip_trivia(tokens: &[LexToken], mut idx: usize) -> usize {
674 while idx < tokens.len() && tokens[idx].kind.is_trivia() {
675 idx += 1;
676 }
677 idx
678}
679
680/// Clause-start keywords + statement terminators that always end recovery.
681/// Any clause recovery set is implicitly unioned with this.
682pub(crate) const RECOVERY_STOP: TokenSet = TokenSet::new(&[
683 SyntaxKind::MATCH_KW,
684 SyntaxKind::OPTIONAL_KW,
685 SyntaxKind::WHERE_KW,
686 SyntaxKind::WITH_KW,
687 SyntaxKind::RETURN_KW,
688 SyntaxKind::CREATE_KW,
689 SyntaxKind::MERGE_KW,
690 SyntaxKind::SET_KW,
691 SyntaxKind::REMOVE_KW,
692 SyntaxKind::DELETE_KW,
693 SyntaxKind::DETACH_KW,
694 SyntaxKind::UNWIND_KW,
695 SyntaxKind::CALL_KW,
696 SyntaxKind::UNION_KW,
697 SyntaxKind::SEMI,
698]);
699
700// ========================================================================
701// Marker / CompletedMarker (drop-bomb protected)
702// ========================================================================
703
704/// A pending node-open. Must be either `complete`d or `abandon`ed before
705/// it is dropped. The [`DropBomb`] panics in debug builds if dropped.
706#[must_use = "Marker must be completed or abandoned"]
707pub(crate) struct Marker {
708 pos: u32,
709 bomb: DropBomb,
710}
711
712impl Marker {
713 fn new(pos: usize) -> Self {
714 Self {
715 pos: u32::try_from(pos).expect("event index fits u32"),
716 bomb: DropBomb::new("Marker must be completed or abandoned"),
717 }
718 }
719
720 /// Close the node with the given kind, yielding a `CompletedMarker`
721 /// that can be used for Pratt-style re-parenting via [`CompletedMarker::precede`].
722 pub(crate) fn complete(mut self, p: &mut Parser<'_>, kind: SyntaxKind) -> CompletedMarker {
723 self.bomb.defuse();
724 let idx = self.pos as usize;
725 match &mut p.events[idx] {
726 Event::Start { kind: slot, .. } => *slot = kind,
727 _ => unreachable!("marker index does not point to a Start event"),
728 }
729 p.events.push(Event::Finish);
730 CompletedMarker {
731 pos: self.pos,
732 kind,
733 }
734 }
735
736 /// Drop the speculative node. If no events were emitted after the
737 /// marker's Start, we truncate; otherwise we tombstone the Start.
738 #[allow(dead_code)]
739 pub(crate) fn abandon(mut self, p: &mut Parser<'_>) {
740 self.bomb.defuse();
741 let idx = self.pos as usize;
742 if idx + 1 == p.events.len() {
743 p.events.pop();
744 } else {
745 p.events[idx] = Event::Abandoned;
746 }
747 }
748}
749
750/// A completed node handle. Can be retroactively re-parented via
751/// [`CompletedMarker::precede`] — central to Pratt expression parsing.
752#[derive(Debug, Copy, Clone)]
753pub(crate) struct CompletedMarker {
754 pos: u32,
755 #[allow(dead_code)]
756 kind: SyntaxKind,
757}
758
759impl CompletedMarker {
760 /// Open a new outer node that will re-parent this completed node. The
761 /// returned `Marker` must be completed; the outer node's Start/Finish
762 /// will surround this node's Start/Finish in the emitted tree.
763 pub(crate) fn precede(self, p: &mut Parser<'_>) -> Marker {
764 let new_marker = p.start();
765 // Link this completed Start's forward_parent to the new marker.
766 let old_idx = self.pos as usize;
767 let new_idx = new_marker.pos as usize;
768 let delta = u32::try_from(new_idx - old_idx).expect("forward-parent delta fits u32");
769 match &mut p.events[old_idx] {
770 Event::Start { forward_parent, .. } => {
771 debug_assert!(forward_parent.is_none(), "forward_parent already set");
772 *forward_parent = Some(delta);
773 }
774 _ => unreachable!("CompletedMarker pos does not point to a Start"),
775 }
776 new_marker
777 }
778}
779
780#[cfg(test)]
781mod tests {
782 use super::parse;
783 use crate::SyntaxKind;
784
785 #[test]
786 fn parse_empty_is_lossless() {
787 let p = parse("");
788 assert_eq!(p.syntax().to_string(), "");
789 }
790
791 #[test]
792 fn parse_simple_is_lossless() {
793 let src = "MATCH (n) RETURN n";
794 let p = parse(src);
795 assert_eq!(p.syntax().to_string(), src);
796 }
797
798 #[test]
799 fn parse_with_comments_is_lossless() {
800 let src = "// leading\nMATCH (n) /* inline */ RETURN n";
801 let p = parse(src);
802 assert_eq!(p.syntax().to_string(), src);
803 }
804
805 #[test]
806 fn root_kind_is_source_file() {
807 let p = parse("MATCH (n) RETURN n");
808 assert_eq!(p.syntax().kind(), SyntaxKind::SOURCE_FILE);
809 }
810
811 use proptest::prelude::*;
812
813 proptest! {
814 /// Property P17.3.2 — lossless CST over arbitrary UTF-8 input.
815 #[test]
816 fn lossless_on_arbitrary_utf8(s in ".*") {
817 let p = parse(&s);
818 prop_assert_eq!(p.syntax().to_string(), s);
819 }
820
821 /// Property P17.3.1 — parser totality: no panic, bounded time.
822 #[test]
823 fn parser_is_total_on_random_utf8(s in ".*") {
824 let _ = parse(&s);
825 }
826 }
827}