Skip to main content

cyrs_syntax/
parser.rs

1//! Event-based recovering parser (spec 0001 §4.2, §4.3, §4.4, §4.6).
2//!
3//! # Architecture
4//!
5//! The parser is a hand-written recursive-descent engine that emits an
6//! `Event` stream rather than directly mutating a tree. A separate
7//! `SyntaxTreeBuilder` — living in the internal `builder` module —
8//! consumes the events and produces the [`rowan`] green tree. This split
9//! lets the parser:
10//!
11//! - rewrite events for associativity fixing in Pratt expression parsing
12//!   (the `Start` event can be retroactively relocated via the
13//!   `Marker::precede` mechanism);
14//! - group or suppress spurious errors before they reach diagnostics;
15//! - interleave trivia tokens (whitespace, comments) between significant
16//!   tokens so the constructed tree is byte-lossless per §4.4.
17//!
18//! # Error recovery
19//!
20//! Per spec §4.3 the recovering parser exposes two primitives the grammar
21//! builds on:
22//!
23//! - **Synchronisation-set skip** (`Parser::recover_until`): when a
24//!   clause fails, tokens are bumped into an `ERROR` node until we see a
25//!   clause-level keyword, a `;`, or EOF.
26//! - **Virtual-token insertion** (`Parser::expect`): when a required
27//!   closer/keyword is missing, an `ERROR` node of zero width is recorded
28//!   at the expected position; the parser continues.
29//!
30//! Both primitives are shared across productions; per-production recovery
31//! tables (spec §4.3) land in a later bead (`cy-2vh`).
32//!
33//! # Losslessness invariant
34//!
35//! Spec §4.4: `parse(s).syntax().to_string() == s` for every input. Trivia
36//! and `ERROR` tokens are preserved in the tree; the property test
37//! `lossless_on_arbitrary_utf8` enforces the invariant.
38
39use drop_bomb::DropBomb;
40use rowan::GreenNode;
41use thiserror::Error;
42
43use crate::{
44    SyntaxKind, SyntaxNode,
45    builder::SyntaxTreeBuilder,
46    lexer::{LexToken, lex},
47};
48
49/// A completed parse: green tree + accumulated syntax errors.
50#[derive(Debug, Clone)]
51pub struct Parse {
52    green: GreenNode,
53    errors: Vec<SyntaxError>,
54}
55
56impl Parse {
57    /// Root `SyntaxNode` view of the parsed tree.  Cheap to clone —
58    /// the underlying `GreenNode` is ref-counted.
59    #[must_use]
60    pub fn syntax(&self) -> SyntaxNode {
61        SyntaxNode::new_root(self.green.clone())
62    }
63
64    /// Raw rowan green tree backing the parse.  Usually
65    /// `syntax()` is what you want; `green()` is exposed for
66    /// callers that want to share the tree by reference.
67    #[must_use]
68    pub fn green(&self) -> &GreenNode {
69        &self.green
70    }
71
72    /// Accumulated syntax errors in source order.  Empty when the
73    /// parse was fully clean.
74    #[must_use]
75    pub fn errors(&self) -> &[SyntaxError] {
76        &self.errors
77    }
78
79    /// Construct a `Parse` from raw parts. Crate-private — used by the
80    /// incremental smart-path splicer (cy-li5) which builds a new green
81    /// tree via `replace_with` and supplies a re-derived error list.
82    #[cfg(feature = "incremental")]
83    pub(crate) fn from_parts(green: GreenNode, errors: Vec<SyntaxError>) -> Self {
84        Parse { green, errors }
85    }
86}
87
88/// Entry point. Parses a complete source file.
89///
90/// Invariants:
91/// - `parse(src).syntax().to_string() == src` for every input (spec §4.4).
92/// - Never panics on any UTF-8 input (spec §17.3 P17.3.1).
93#[must_use]
94pub fn parse(src: &str) -> Parse {
95    let tokens = lex(src);
96    let mut parser = Parser::new(&tokens);
97    crate::grammar::source_file(&mut parser);
98    let events = parser.into_events();
99    let (green, errors) = SyntaxTreeBuilder::build(&tokens, events);
100    Parse { green, errors }
101}
102
103/// A single syntax error record. Spec §4.3 + §10.
104///
105/// `cyrs-syntax` does not depend on `cyrs-diag` (§3.1), so this is a
106/// local stand-in carrying just the minimum a later pass needs to lift it
107/// into a full `Diagnostic`: a stable numeric code (matching the
108/// `DiagCode` discriminants in `cyrs-diag`), a human-readable message,
109/// and the byte offset at which it was emitted.
110///
111/// Embedders should not match on the raw `u16` `code` field — names
112/// survive renumbering, magic numbers do not (cy-emb3). When
113/// `cyrs-diag` is on the dependency graph (typically via `cyrs-db`),
114/// lift the numeric to the typed enum:
115///
116/// ```ignore
117/// use cyrs_diag::DiagCode;
118/// match DiagCode::from(err) {
119///     DiagCode::E0007 => { /* expected statement */ }
120///     _ => { /* other syntax codes */ }
121/// }
122/// ```
123#[derive(Debug, Clone, Error)]
124#[error("[E{code:04}] {message}")]
125pub struct SyntaxError {
126    /// Numeric value of the `DiagCode` discriminant (e.g. 3 for E0003).
127    pub code: u16,
128    /// Human-readable message (rustc-style lower-case initial, no
129    /// trailing period).
130    pub message: String,
131    /// Byte offset at which the error was emitted.  Typically the
132    /// position of the offending token.
133    pub offset: text_size::TextSize,
134}
135
136// ========================================================================
137// Event stream
138// ========================================================================
139
140/// Events are emitted by the parser and consumed by
141/// [`SyntaxTreeBuilder`](crate::builder::SyntaxTreeBuilder).
142///
143/// `Start` carries an optional forward-parent index implementing
144/// [`Marker::precede`] (re-parenting a started node inside a new outer
145/// node); [`Event::Abandoned`] retracts a speculatively-started node.
146///
147/// The event stream is flat: Start/Finish pairs define nesting, Token
148/// events are emitted in source order alongside trivia that the builder
149/// re-interleaves from the underlying token slice.
150#[derive(Debug)]
151pub(crate) enum Event {
152    Start {
153        kind: SyntaxKind,
154        forward_parent: Option<u32>,
155    },
156    Finish,
157    Token {
158        kind: SyntaxKind,
159    },
160    Error {
161        code: u16,
162        msg: String,
163        offset: text_size::TextSize,
164    },
165    Abandoned,
166}
167
168impl Event {
169    /// Placeholder used by the builder when it swap-removes an event
170    /// during forward-parent chain resolution.
171    pub(crate) fn tombstone() -> Self {
172        Event::Abandoned
173    }
174}
175
176// ========================================================================
177// Parser engine
178// ========================================================================
179
180/// Token-set bitmap used by `at_ts`/`recover_until`. Backed by an
181/// 8×u128 array so we can represent every [`SyntaxKind`] value in
182/// `0..1024` without collisions (the enum's high-water mark is
183/// [`SyntaxKind::EOF`] = 769). Every construction is `const`-friendly so
184/// grammar code can build sets at compile time.
185#[derive(Copy, Clone, Debug)]
186pub(crate) struct TokenSet([u128; TOKEN_SET_WORDS]);
187
188const TOKEN_SET_WORDS: usize = 8;
189#[allow(clippy::cast_possible_truncation)]
190const TOKEN_SET_CAPACITY: u16 = (TOKEN_SET_WORDS as u16) * 128;
191
192impl TokenSet {
193    pub(crate) const EMPTY: TokenSet = TokenSet([0; TOKEN_SET_WORDS]);
194
195    pub(crate) const fn new(kinds: &[SyntaxKind]) -> Self {
196        let mut mask = [0u128; TOKEN_SET_WORDS];
197        let mut i = 0;
198        while i < kinds.len() {
199            let raw = kinds[i] as u16;
200            assert!(raw < TOKEN_SET_CAPACITY, "TokenSet: kind out of range");
201            let word = (raw / 128) as usize;
202            let bit = raw % 128;
203            mask[word] |= 1u128 << bit;
204            i += 1;
205        }
206        TokenSet(mask)
207    }
208
209    pub(crate) const fn union(self, other: TokenSet) -> TokenSet {
210        let mut out = [0u128; TOKEN_SET_WORDS];
211        let mut i = 0;
212        while i < TOKEN_SET_WORDS {
213            out[i] = self.0[i] | other.0[i];
214            i += 1;
215        }
216        TokenSet(out)
217    }
218
219    pub(crate) const fn contains(self, kind: SyntaxKind) -> bool {
220        let raw = kind as u16;
221        if raw >= TOKEN_SET_CAPACITY {
222            return false;
223        }
224        let word = (raw / 128) as usize;
225        let bit = raw % 128;
226        (self.0[word] & (1u128 << bit)) != 0
227    }
228}
229
230impl Default for TokenSet {
231    fn default() -> Self {
232        TokenSet::EMPTY
233    }
234}
235
236/// The recovering parser. Owns the token slice plus an in-flight event
237/// buffer. A non-trivia cursor jumps over whitespace/comment tokens so
238/// grammar code never sees them — the builder reassembles trivia when
239/// consuming events.
240pub(crate) struct Parser<'a> {
241    tokens: &'a [LexToken],
242    /// Position in `tokens` — the index of the next *significant* token.
243    /// Trivia between significant tokens is implicitly skipped.
244    pos: usize,
245    events: Vec<Event>,
246}
247
248impl<'a> Parser<'a> {
249    pub(crate) fn new(tokens: &'a [LexToken]) -> Self {
250        Self {
251            tokens,
252            pos: skip_trivia(tokens, 0),
253            events: Vec::new(),
254        }
255    }
256
257    /// Current significant token kind, or `EOF` if exhausted.
258    pub(crate) fn current(&self) -> SyntaxKind {
259        self.tokens
260            .get(self.pos)
261            .map_or(SyntaxKind::EOF, |t| t.kind)
262    }
263
264    /// Look ahead `n` significant tokens. `nth(0) == current()`.
265    #[allow(dead_code)]
266    pub(crate) fn nth(&self, n: usize) -> SyntaxKind {
267        let mut idx = self.pos;
268        let mut remaining = n;
269        loop {
270            if idx >= self.tokens.len() {
271                return SyntaxKind::EOF;
272            }
273            if self.tokens[idx].kind.is_trivia() {
274                idx += 1;
275                continue;
276            }
277            if remaining == 0 {
278                return self.tokens[idx].kind;
279            }
280            remaining -= 1;
281            idx += 1;
282        }
283    }
284
285    pub(crate) fn at(&self, kind: SyntaxKind) -> bool {
286        self.current() == kind
287    }
288
289    pub(crate) fn at_ts(&self, set: TokenSet) -> bool {
290        set.contains(self.current())
291    }
292
293    /// Case-insensitive match against a contextual keyword spelled as an
294    /// identifier. Cypher allows unreserved identifiers (e.g. `WITH` when
295    /// scoped by context); for v1 we only need this for the composite
296    /// operators `STARTS WITH` / `ENDS WITH`, which are handled via the
297    /// already-tokenised `STARTS_KW` / `ENDS_KW` and thus do not need the
298    /// hook. Left here as a single call site in case later beads do.
299    #[allow(dead_code)]
300    pub(crate) fn at_contextual(&self, ident: &str) -> bool {
301        self.current() == SyntaxKind::IDENT
302            && self
303                .tokens
304                .get(self.pos)
305                .is_some_and(|t| t.text.eq_ignore_ascii_case(ident))
306    }
307
308    /// Byte offset of the current token (or end-of-input if exhausted).
309    fn current_offset(&self) -> text_size::TextSize {
310        self.tokens.get(self.pos).map_or_else(
311            || {
312                self.tokens
313                    .last()
314                    .map_or(text_size::TextSize::from(0), |t| t.range.end())
315            },
316            |t| t.range.start(),
317        )
318    }
319
320    /// Consume whatever token is current (including EOF-no-op) and emit it
321    /// as a Token event. Use sparingly — prefer `bump(kind)` when the kind
322    /// is known for the debug assertion.
323    pub(crate) fn bump_any(&mut self) {
324        let kind = self.current();
325        if kind == SyntaxKind::EOF {
326            return;
327        }
328        self.events.push(Event::Token { kind });
329        self.pos += 1;
330        self.pos = skip_trivia(self.tokens, self.pos);
331    }
332
333    /// Consume the current token, asserting it has the given kind in debug.
334    pub(crate) fn bump(&mut self, kind: SyntaxKind) {
335        debug_assert!(
336            self.at(kind),
337            "bump: expected {kind:?}, got {:?}",
338            self.current()
339        );
340        self.bump_any();
341    }
342
343    /// Consume `kind` if present; return whether it was.
344    pub(crate) fn eat(&mut self, kind: SyntaxKind) -> bool {
345        if self.at(kind) {
346            self.bump(kind);
347            true
348        } else {
349            false
350        }
351    }
352
353    /// Record a diagnostic at the current position without consuming.
354    ///
355    /// `code` is the numeric discriminant of the corresponding `DiagCode`
356    /// variant in `cyrs-diag` (e.g. `3` for `E0003`). Use the
357    /// `syntax_codes` constants in this module for clarity.
358    pub(crate) fn error_code(&mut self, code: u16, msg: impl Into<String>) {
359        let offset = self.current_offset();
360        self.events.push(Event::Error {
361            code,
362            msg: msg.into(),
363            offset,
364        });
365    }
366
367    /// Record a generic syntax error (E0001) at the current position.
368    ///
369    /// Prefer [`Parser::error_code`] with a specific code at every call
370    /// site that maps to a known error class.
371    pub(crate) fn error(&mut self, msg: impl Into<String>) {
372        self.error_code(syntax_codes::GENERIC_SYNTAX_ERROR, msg);
373    }
374
375    /// Consume one token and wrap it in an ERROR node alongside a message.
376    #[allow(dead_code)]
377    pub(crate) fn err_and_bump(&mut self, msg: impl Into<String>) {
378        let m = self.start();
379        self.error(msg);
380        self.bump_any();
381        m.complete(self, SyntaxKind::ERROR);
382    }
383
384    /// Require `kind`. If present, consume it; otherwise emit a diagnostic
385    /// at the expected position. The parser continues — the missing
386    /// closer acts as a zero-width `ERROR` in the event stream
387    /// (virtual-token insertion per spec §4.3).
388    #[allow(dead_code)]
389    pub(crate) fn expect(&mut self, kind: SyntaxKind) -> bool {
390        if self.eat(kind) {
391            true
392        } else {
393            self.error_code(syntax_codes::EXPECTED_TOKEN, format!("expected {kind:?}"));
394            false
395        }
396    }
397
398    /// Skip tokens until we hit one in `set`, an already-synchronising
399    /// clause keyword, a `;`, or EOF. Skipped tokens are wrapped in a
400    /// single `ERROR` node (spec §4.3 — synchronisation-set skip).
401    pub(crate) fn recover_until(&mut self, set: TokenSet) {
402        let stop = set.union(RECOVERY_STOP);
403        if stop.contains(self.current()) || self.current() == SyntaxKind::EOF {
404            return;
405        }
406        let m = self.start();
407        while !stop.contains(self.current()) && self.current() != SyntaxKind::EOF {
408            self.bump_any();
409        }
410        m.complete(self, SyntaxKind::ERROR);
411    }
412
413    /// Begin a new node. Returns a `Marker` that must be completed or
414    /// abandoned before it is dropped.
415    pub(crate) fn start(&mut self) -> Marker {
416        let pos = self.events.len();
417        self.events.push(Event::Start {
418            kind: SyntaxKind::ERROR, // placeholder; replaced by complete()
419            forward_parent: None,
420        });
421        Marker::new(pos)
422    }
423
424    /// Return the current token-cursor position. Used by the no-progress
425    /// guard in `source_file` to detect loops where no token is consumed.
426    pub(crate) fn position(&self) -> usize {
427        self.pos
428    }
429
430    pub(crate) fn into_events(self) -> Vec<Event> {
431        self.events
432    }
433}
434
435// ========================================================================
436// Syntax error code constants (spec §10.2, E0001–E0999)
437// ========================================================================
438//
439// These numeric values MUST match the discriminants of the corresponding
440// `DiagCode` enum variants in `cyrs-diag/src/codes.rs`.  `cyrs-syntax`
441// cannot depend on `cyrs-diag` (§3.1), so the codes are mirrored here
442// as plain `u16` constants.  CI enforces that each constant is actually
443// used at a call site and that its value exists in the registry.
444
445pub(crate) mod syntax_codes {
446    /// E0001 — generic / unclassified syntax error.
447    pub(crate) const GENERIC_SYNTAX_ERROR: u16 = 1;
448    /// E0002 — unexpected token.
449    pub(crate) const UNEXPECTED_TOKEN: u16 = 2;
450    /// E0003 — expected `<token>`, found `<token>`.
451    pub(crate) const EXPECTED_TOKEN: u16 = 3;
452    /// E0004 — unclosed string literal.
453    pub(crate) const UNCLOSED_STRING: u16 = 4;
454    /// E0005 — unclosed block comment.
455    pub(crate) const UNCLOSED_BLOCK_COMMENT: u16 = 5;
456    /// E0006 — invalid numeric literal.
457    pub(crate) const INVALID_NUMERIC_LITERAL: u16 = 6;
458    /// E0007 — expected statement.
459    pub(crate) const EXPECTED_STATEMENT: u16 = 7;
460    /// E0008 — expected `;` or end of input.
461    pub(crate) const EXPECTED_SEMICOLON_OR_EOF: u16 = 8;
462    /// E0009 — expected `(` to start a node pattern.
463    pub(crate) const EXPECTED_LPAREN_NODE: u16 = 9;
464    /// E0010 — expected node pattern after relationship.
465    pub(crate) const EXPECTED_NODE_AFTER_REL: u16 = 10;
466    /// E0011 — expected `)` to close node pattern.
467    pub(crate) const EXPECTED_RPAREN_NODE: u16 = 11;
468    /// E0012 — expected `-` at relationship start.
469    pub(crate) const EXPECTED_DASH_REL_START: u16 = 12;
470    /// E0013 — expected `-` to close relationship.
471    pub(crate) const EXPECTED_DASH_REL_CLOSE: u16 = 13;
472    /// E0014 — expected `-` or `->` to close relationship.
473    pub(crate) const EXPECTED_DASH_OR_ARROW: u16 = 14;
474    /// E0015 — expected `]` to close relationship detail.
475    pub(crate) const EXPECTED_RBRACK_REL: u16 = 15;
476    /// E0016 — expected label after `:`.
477    pub(crate) const EXPECTED_LABEL: u16 = 16;
478    /// E0017 — expected relationship type after `:`.
479    pub(crate) const EXPECTED_REL_TYPE: u16 = 17;
480    /// E0018 — expected `}` to close property map.
481    pub(crate) const EXPECTED_RBRACE_PROP: u16 = 18;
482    /// E0019 — expected property key.
483    pub(crate) const EXPECTED_PROP_KEY: u16 = 19;
484    /// E0020 — expected `:` in property entry.
485    pub(crate) const EXPECTED_COLON_PROP: u16 = 20;
486    /// E0021 — expected expression for property value.
487    pub(crate) const EXPECTED_PROP_VALUE: u16 = 21;
488    /// E0022 — expected identifier.
489    pub(crate) const EXPECTED_IDENT: u16 = 22;
490    /// E0023 — expression nesting limit exceeded.
491    pub(crate) const EXPR_NESTING_LIMIT: u16 = 23;
492    /// E0024 — expected operand after unary operator.
493    pub(crate) const EXPECTED_UNARY_OPERAND: u16 = 24;
494    /// E0025 — expected `NULL` after `IS`.
495    pub(crate) const EXPECTED_NULL_AFTER_IS: u16 = 25;
496    /// E0026 — expected right-hand side of binary expression.
497    pub(crate) const EXPECTED_BINOP_RHS: u16 = 26;
498    /// E0027 — expected expression inside parentheses.
499    pub(crate) const EXPECTED_EXPR_IN_PARENS: u16 = 27;
500    /// E0028 — expected `)` to close expression.
501    pub(crate) const EXPECTED_RPAREN_EXPR: u16 = 28;
502    /// E0029 — expected `WITH` after `STARTS`.
503    pub(crate) const EXPECTED_WITH_AFTER_STARTS: u16 = 29;
504    /// E0030 — expected `WITH` after `ENDS`.
505    pub(crate) const EXPECTED_WITH_AFTER_ENDS: u16 = 30;
506    /// E0031 — expected property key after `.`.
507    pub(crate) const EXPECTED_PROP_KEY_AFTER_DOT: u16 = 31;
508    /// E0032 — expected index expression.
509    pub(crate) const EXPECTED_INDEX_EXPR: u16 = 32;
510    /// E0033 — expected `]` to close index expression.
511    pub(crate) const EXPECTED_RBRACK_INDEX: u16 = 33;
512    /// E0034 — expected `)` to close function call.
513    pub(crate) const EXPECTED_RPAREN_CALL: u16 = 34;
514    /// E0035 — expected function argument.
515    pub(crate) const EXPECTED_CALL_ARG: u16 = 35;
516    /// E0036 — expected expression in `RETURN` item.
517    pub(crate) const EXPECTED_RETURN_EXPR: u16 = 36;
518    /// E0037 — expected identifier after `AS`.
519    pub(crate) const EXPECTED_IDENT_AFTER_AS: u16 = 37;
520    /// E0038 — expected `BY` after `ORDER`.
521    pub(crate) const EXPECTED_BY_AFTER_ORDER: u16 = 38;
522    /// E0039 — expected expression in `ORDER BY`.
523    pub(crate) const EXPECTED_ORDERBY_EXPR: u16 = 39;
524    /// E0040 — expected expression after `SKIP`.
525    pub(crate) const EXPECTED_SKIP_EXPR: u16 = 40;
526    /// E0041 — expected expression after `LIMIT`.
527    pub(crate) const EXPECTED_LIMIT_EXPR: u16 = 41;
528    /// E0042 — expected `MATCH` after `OPTIONAL`.
529    pub(crate) const EXPECTED_MATCH_AFTER_OPTIONAL: u16 = 42;
530    /// E0043 — expected expression after `WHERE`.
531    pub(crate) const EXPECTED_WHERE_EXPR: u16 = 43;
532    /// E0044 — clause not yet implemented (deferred construct).
533    /// E0044 — was the cy-nom-era "unimplemented clause" stub error
534    /// emitted by `deferred_clause_stub`. cy-4mg landed CALL — the last
535    /// previously-deferred clause — so the helper is gone. Code number
536    /// retained to preserve the diagnostic registry's monotonic numbering.
537    #[allow(dead_code)]
538    pub(crate) const UNIMPLEMENTED_CLAUSE_RESERVED: u16 = 44;
539    /// E0045 — expected a clause keyword.
540    pub(crate) const EXPECTED_CLAUSE: u16 = 45;
541    /// E0046 — invalid escape sequence in string literal.
542    pub(crate) const INVALID_ESCAPE: u16 = 46;
543    /// E0047 — expected expression in list literal.
544    pub(crate) const EXPECTED_LIST_ELEM: u16 = 47;
545    /// E0048 — expected `]` to close list literal.
546    pub(crate) const EXPECTED_RBRACK_LIST: u16 = 48;
547    /// E0049 — expected key in map literal.
548    pub(crate) const EXPECTED_MAP_KEY: u16 = 49;
549    /// E0050 — expected `:` in map entry.
550    pub(crate) const EXPECTED_COLON_MAP: u16 = 50;
551    /// E0051 — expected expression for map value.
552    pub(crate) const EXPECTED_MAP_VALUE: u16 = 51;
553    /// E0052 — expected `}` to close map literal.
554    pub(crate) const EXPECTED_RBRACE_MAP: u16 = 52;
555    /// E0053 — expected expression after `UNWIND`.
556    pub(crate) const EXPECTED_UNWIND_EXPR: u16 = 53;
557    /// E0054 — expected `AS` after `UNWIND` expression.
558    pub(crate) const EXPECTED_AS_UNWIND: u16 = 54;
559    /// E0055 — expected pattern after `CREATE`.
560    pub(crate) const EXPECTED_CREATE_PATTERN: u16 = 55;
561    /// E0056 — expected pattern after `MERGE`.
562    pub(crate) const EXPECTED_MERGE_PATTERN: u16 = 56;
563    /// E0057 — expected SET item (property assignment or label add).
564    pub(crate) const EXPECTED_SET_ITEM: u16 = 57;
565    /// E0058 — expected REMOVE item (property or label).
566    pub(crate) const EXPECTED_REMOVE_ITEM: u16 = 58;
567    /// E0059 — expected expression after `DELETE`.
568    pub(crate) const EXPECTED_DELETE_EXPR: u16 = 59;
569    /// E0060 — expected `DELETE` after `DETACH`.
570    pub(crate) const EXPECTED_DELETE_AFTER_DETACH: u16 = 60;
571    /// E0061 — expected `CREATE` or `MATCH` after `ON` in MERGE action.
572    pub(crate) const EXPECTED_ON_ACTION: u16 = 61;
573    /// E0062 — expected `]` to close variable-length hop.
574    #[allow(dead_code)]
575    pub(crate) const EXPECTED_RBRACK_HOP: u16 = 62;
576    /// E0063 — expected `=` after path binder identifier.
577    #[allow(dead_code)]
578    pub(crate) const EXPECTED_EQ_PATH_BIND: u16 = 63;
579    /// E0064 — expected `]` to close an indexing / slicing bracket.
580    ///
581    /// Distinct from `EXPECTED_RBRACK_INDEX` (E0033) which covers the
582    /// legacy `SUBSCRIPT_EXPR` path: E0064 is emitted by the typed
583    /// index / slice recovery introduced in cy-7s6.1.
584    pub(crate) const UNCLOSED_INDEX_BRACKET: u16 = 64;
585    /// E0065 — expected `(` after ANY / ALL / NONE / SINGLE keyword to
586    /// begin the list-predicate arg list (cy-8x5).
587    pub(crate) const EXPECTED_LPAREN_LIST_PREDICATE: u16 = 65;
588    /// E0066 — expected `IN` between the list-predicate binder and its
589    /// source expression (cy-8x5).
590    pub(crate) const EXPECTED_IN_LIST_PREDICATE: u16 = 66;
591    /// E0067 — expected `)` to close a list-predicate expression
592    /// (cy-8x5).
593    pub(crate) const EXPECTED_RPAREN_LIST_PREDICATE: u16 = 67;
594    /// E0068 — expected `IN` in list comprehension.
595    ///
596    /// Emitted by the list-comprehension parse path when the first
597    /// token inside `[` is an identifier but is not followed by the
598    /// `IN` keyword — cy-5gh.
599    pub(crate) const EXPECTED_IN_LIST_COMP: u16 = 68;
600    /// E0069 — expected `|` or `]` in list comprehension.
601    ///
602    /// Emitted when a list comprehension's trailing segment is
603    /// malformed — either a missing `|` before the projection
604    /// expression or a missing `]` closer — cy-5gh.
605    pub(crate) const EXPECTED_PIPE_OR_RBRACK_LIST_COMP: u16 = 69;
606    /// E0070 — expected `THEN` in CASE arm.
607    ///
608    /// Emitted by the parser's `CASE` production when a `WHEN <value>`
609    /// clause is not followed by the mandatory `THEN` keyword — cy-41u.
610    pub(crate) const EXPECTED_THEN_CASE: u16 = 70;
611    /// E0071 — expected `END` to close a CASE expression.
612    ///
613    /// Emitted by the parser's `CASE` production when the terminating
614    /// `END` keyword is missing after the final arm / optional `ELSE`
615    /// branch — cy-41u.
616    pub(crate) const EXPECTED_END_CASE: u16 = 71;
617    /// E0072 — expected `)` to close an `EXISTS(<pattern>)` pattern
618    /// predicate (cy-lve, spec §6.1 / §19 row "Pattern predicates in
619    /// expressions").
620    pub(crate) const EXPECTED_RPAREN_EXISTS: u16 = 72;
621    /// E0073 — expected procedure name after `CALL` (cy-4mg, spec §14 /
622    /// §19 row "CALL <proc> YIELD ..."). The procedure-name production
623    /// is `IDENT ('.' IDENT)*`; emitted when the token after `CALL` is
624    /// not an identifier.
625    pub(crate) const EXPECTED_PROCEDURE_NAME: u16 = 73;
626    /// E0074 — expected `)` to close the argument list of a CALL
627    /// invocation (cy-4mg, spec §14).
628    pub(crate) const EXPECTED_RPAREN_CALL_ARGS: u16 = 74;
629    /// E0075 — expected identifier in a YIELD item (cy-4mg, spec §14 /
630    /// ungrammar `YieldItem = NameRef ('AS' NameDef)?`).
631    pub(crate) const EXPECTED_YIELD_ITEM: u16 = 75;
632    /// E0077 — expected `)` to close a `shortestPath` /
633    /// `allShortestPaths` pattern function argument (cy-b5b, spec §6.4 /
634    /// §19 row "shortest-path"). E0076 reserved for `CALL { ... }`
635    /// block subquery (cy-4mg follow-up — block form deferred per §20 D1).
636    pub(crate) const EXPECTED_RPAREN_SHORTEST_PATH: u16 = 77;
637    /// E0078 — expected `}` to close a map projection (cy-01q, spec §6.1 /
638    /// §19 row "Map projection"). Distinct from E0052 (map literal close)
639    /// so tooling can tell projection from literal recovery apart.
640    /// Renumbered from E0073 to avoid collision with cy-4mg.
641    pub(crate) const EXPECTED_RBRACE_MAP_PROJECTION: u16 = 78;
642    /// E0079 — expected property name or `*` after `.` in a map projection
643    /// item (cy-01q). The `.` opens either `.NAME` (property selector) or
644    /// `.*` (all-properties spread); anything else is an error.
645    /// Renumbered from E0074.
646    pub(crate) const EXPECTED_PROP_OR_STAR_AFTER_DOT_IN_PROJECTION: u16 = 79;
647    /// E0080 — expected `:` in a map projection literal item (cy-01q).
648    /// Distinct from E0050 (map literal `:`) for the same reason as
649    /// E0078 vs E0052. Renumbered from E0075.
650    pub(crate) const EXPECTED_COLON_MAP_PROJECTION: u16 = 80;
651    /// E0081 — expected expression for map projection value (cy-01q).
652    /// Renumbered from E0076.
653    pub(crate) const EXPECTED_VALUE_MAP_PROJECTION: u16 = 81;
654    /// E0082 — expected an item in map projection: `.name`, `key: expr`,
655    /// `.*`, or `*` (cy-01q). Renumbered from E0077.
656    pub(crate) const EXPECTED_MAP_PROJECTION_ITEM: u16 = 82;
657
658    // ---- dialect gates (E4xxx, shared with cyrs-diag::codes) -----------
659    // The `error_code` payload is the numeric part of a `DiagCode`
660    // variant. Dialect-gate codes live in `crates/cyrs-diag/src/codes.rs`
661    // and are referenced here as `u16` constants so the parser can emit
662    // them without a cross-crate dependency.
663
664    /// E4017 — `EXISTS { <subquery> }` block form is deferred per spec
665    /// §9.3 / §19 / §20 D1 (cy-lve, tranche A). Registered in
666    /// `cyrs-diag::codes` (`DiagCode::E4017`, `exists_subquery`
667    /// dialect gate). Emitted by the parser when it encounters
668    /// `EXISTS {` in an atom position.
669    pub(crate) const EXISTS_BLOCK_DEFERRED: u16 = 4017;
670}
671
672/// Advance `idx` past any leading trivia tokens.
673fn skip_trivia(tokens: &[LexToken], mut idx: usize) -> usize {
674    while idx < tokens.len() && tokens[idx].kind.is_trivia() {
675        idx += 1;
676    }
677    idx
678}
679
680/// Clause-start keywords + statement terminators that always end recovery.
681/// Any clause recovery set is implicitly unioned with this.
682pub(crate) const RECOVERY_STOP: TokenSet = TokenSet::new(&[
683    SyntaxKind::MATCH_KW,
684    SyntaxKind::OPTIONAL_KW,
685    SyntaxKind::WHERE_KW,
686    SyntaxKind::WITH_KW,
687    SyntaxKind::RETURN_KW,
688    SyntaxKind::CREATE_KW,
689    SyntaxKind::MERGE_KW,
690    SyntaxKind::SET_KW,
691    SyntaxKind::REMOVE_KW,
692    SyntaxKind::DELETE_KW,
693    SyntaxKind::DETACH_KW,
694    SyntaxKind::UNWIND_KW,
695    SyntaxKind::CALL_KW,
696    SyntaxKind::UNION_KW,
697    SyntaxKind::SEMI,
698]);
699
700// ========================================================================
701// Marker / CompletedMarker (drop-bomb protected)
702// ========================================================================
703
704/// A pending node-open. Must be either `complete`d or `abandon`ed before
705/// it is dropped. The [`DropBomb`] panics in debug builds if dropped.
706#[must_use = "Marker must be completed or abandoned"]
707pub(crate) struct Marker {
708    pos: u32,
709    bomb: DropBomb,
710}
711
712impl Marker {
713    fn new(pos: usize) -> Self {
714        Self {
715            pos: u32::try_from(pos).expect("event index fits u32"),
716            bomb: DropBomb::new("Marker must be completed or abandoned"),
717        }
718    }
719
720    /// Close the node with the given kind, yielding a `CompletedMarker`
721    /// that can be used for Pratt-style re-parenting via [`CompletedMarker::precede`].
722    pub(crate) fn complete(mut self, p: &mut Parser<'_>, kind: SyntaxKind) -> CompletedMarker {
723        self.bomb.defuse();
724        let idx = self.pos as usize;
725        match &mut p.events[idx] {
726            Event::Start { kind: slot, .. } => *slot = kind,
727            _ => unreachable!("marker index does not point to a Start event"),
728        }
729        p.events.push(Event::Finish);
730        CompletedMarker {
731            pos: self.pos,
732            kind,
733        }
734    }
735
736    /// Drop the speculative node. If no events were emitted after the
737    /// marker's Start, we truncate; otherwise we tombstone the Start.
738    #[allow(dead_code)]
739    pub(crate) fn abandon(mut self, p: &mut Parser<'_>) {
740        self.bomb.defuse();
741        let idx = self.pos as usize;
742        if idx + 1 == p.events.len() {
743            p.events.pop();
744        } else {
745            p.events[idx] = Event::Abandoned;
746        }
747    }
748}
749
750/// A completed node handle. Can be retroactively re-parented via
751/// [`CompletedMarker::precede`] — central to Pratt expression parsing.
752#[derive(Debug, Copy, Clone)]
753pub(crate) struct CompletedMarker {
754    pos: u32,
755    #[allow(dead_code)]
756    kind: SyntaxKind,
757}
758
759impl CompletedMarker {
760    /// Open a new outer node that will re-parent this completed node. The
761    /// returned `Marker` must be completed; the outer node's Start/Finish
762    /// will surround this node's Start/Finish in the emitted tree.
763    pub(crate) fn precede(self, p: &mut Parser<'_>) -> Marker {
764        let new_marker = p.start();
765        // Link this completed Start's forward_parent to the new marker.
766        let old_idx = self.pos as usize;
767        let new_idx = new_marker.pos as usize;
768        let delta = u32::try_from(new_idx - old_idx).expect("forward-parent delta fits u32");
769        match &mut p.events[old_idx] {
770            Event::Start { forward_parent, .. } => {
771                debug_assert!(forward_parent.is_none(), "forward_parent already set");
772                *forward_parent = Some(delta);
773            }
774            _ => unreachable!("CompletedMarker pos does not point to a Start"),
775        }
776        new_marker
777    }
778}
779
780#[cfg(test)]
781mod tests {
782    use super::parse;
783    use crate::SyntaxKind;
784
785    #[test]
786    fn parse_empty_is_lossless() {
787        let p = parse("");
788        assert_eq!(p.syntax().to_string(), "");
789    }
790
791    #[test]
792    fn parse_simple_is_lossless() {
793        let src = "MATCH (n) RETURN n";
794        let p = parse(src);
795        assert_eq!(p.syntax().to_string(), src);
796    }
797
798    #[test]
799    fn parse_with_comments_is_lossless() {
800        let src = "// leading\nMATCH (n) /* inline */ RETURN n";
801        let p = parse(src);
802        assert_eq!(p.syntax().to_string(), src);
803    }
804
805    #[test]
806    fn root_kind_is_source_file() {
807        let p = parse("MATCH (n) RETURN n");
808        assert_eq!(p.syntax().kind(), SyntaxKind::SOURCE_FILE);
809    }
810
811    use proptest::prelude::*;
812
813    proptest! {
814        /// Property P17.3.2 — lossless CST over arbitrary UTF-8 input.
815        #[test]
816        fn lossless_on_arbitrary_utf8(s in ".*") {
817            let p = parse(&s);
818            prop_assert_eq!(p.syntax().to_string(), s);
819        }
820
821        /// Property P17.3.1 — parser totality: no panic, bounded time.
822        #[test]
823        fn parser_is_total_on_random_utf8(s in ".*") {
824            let _ = parse(&s);
825        }
826    }
827}