cyrs-syntax 0.1.0

Lossless CST and recovering parser for Cypher / GQL (spec 0001 §4).
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
//! Event-based recovering parser (spec 0001 §4.2, §4.3, §4.4, §4.6).
//!
//! # Architecture
//!
//! The parser is a hand-written recursive-descent engine that emits an
//! `Event` stream rather than directly mutating a tree. A separate
//! `SyntaxTreeBuilder` — living in the internal `builder` module —
//! consumes the events and produces the [`rowan`] green tree. This split
//! lets the parser:
//!
//! - rewrite events for associativity fixing in Pratt expression parsing
//!   (the `Start` event can be retroactively relocated via the
//!   `Marker::precede` mechanism);
//! - group or suppress spurious errors before they reach diagnostics;
//! - interleave trivia tokens (whitespace, comments) between significant
//!   tokens so the constructed tree is byte-lossless per §4.4.
//!
//! # Error recovery
//!
//! Per spec §4.3 the recovering parser exposes two primitives the grammar
//! builds on:
//!
//! - **Synchronisation-set skip** (`Parser::recover_until`): when a
//!   clause fails, tokens are bumped into an `ERROR` node until we see a
//!   clause-level keyword, a `;`, or EOF.
//! - **Virtual-token insertion** (`Parser::expect`): when a required
//!   closer/keyword is missing, an `ERROR` node of zero width is recorded
//!   at the expected position; the parser continues.
//!
//! Both primitives are shared across productions; per-production recovery
//! tables (spec §4.3) land in a later bead (`cy-2vh`).
//!
//! # Losslessness invariant
//!
//! Spec §4.4: `parse(s).syntax().to_string() == s` for every input. Trivia
//! and `ERROR` tokens are preserved in the tree; the property test
//! `lossless_on_arbitrary_utf8` enforces the invariant.

use drop_bomb::DropBomb;
use rowan::GreenNode;
use thiserror::Error;

use crate::{
    SyntaxKind, SyntaxNode,
    builder::SyntaxTreeBuilder,
    lexer::{LexToken, lex},
};

/// A completed parse: green tree + accumulated syntax errors.
#[derive(Debug, Clone)]
pub struct Parse {
    green: GreenNode,
    errors: Vec<SyntaxError>,
}

impl Parse {
    /// Root `SyntaxNode` view of the parsed tree.  Cheap to clone —
    /// the underlying `GreenNode` is ref-counted.
    #[must_use]
    pub fn syntax(&self) -> SyntaxNode {
        SyntaxNode::new_root(self.green.clone())
    }

    /// Raw rowan green tree backing the parse.  Usually
    /// `syntax()` is what you want; `green()` is exposed for
    /// callers that want to share the tree by reference.
    #[must_use]
    pub fn green(&self) -> &GreenNode {
        &self.green
    }

    /// Accumulated syntax errors in source order.  Empty when the
    /// parse was fully clean.
    #[must_use]
    pub fn errors(&self) -> &[SyntaxError] {
        &self.errors
    }

    /// Construct a `Parse` from raw parts. Crate-private — used by the
    /// incremental smart-path splicer (cy-li5) which builds a new green
    /// tree via `replace_with` and supplies a re-derived error list.
    #[cfg(feature = "incremental")]
    pub(crate) fn from_parts(green: GreenNode, errors: Vec<SyntaxError>) -> Self {
        Parse { green, errors }
    }
}

/// Entry point. Parses a complete source file.
///
/// Invariants:
/// - `parse(src).syntax().to_string() == src` for every input (spec §4.4).
/// - Never panics on any UTF-8 input (spec §17.3 P17.3.1).
#[must_use]
pub fn parse(src: &str) -> Parse {
    let tokens = lex(src);
    let mut parser = Parser::new(&tokens);
    crate::grammar::source_file(&mut parser);
    let events = parser.into_events();
    let (green, errors) = SyntaxTreeBuilder::build(&tokens, events);
    Parse { green, errors }
}

/// A single syntax error record. Spec §4.3 + §10.
///
/// `cyrs-syntax` does not depend on `cyrs-diag` (§3.1), so this is a
/// local stand-in carrying just the minimum a later pass needs to lift it
/// into a full `Diagnostic`: a stable numeric code (matching the
/// `DiagCode` discriminants in `cyrs-diag`), a human-readable message,
/// and the byte offset at which it was emitted.
///
/// Embedders should not match on the raw `u16` `code` field — names
/// survive renumbering, magic numbers do not (cy-emb3). When
/// `cyrs-diag` is on the dependency graph (typically via `cyrs-db`),
/// lift the numeric to the typed enum:
///
/// ```ignore
/// use cyrs_diag::DiagCode;
/// match DiagCode::from(err) {
///     DiagCode::E0007 => { /* expected statement */ }
///     _ => { /* other syntax codes */ }
/// }
/// ```
#[derive(Debug, Clone, Error)]
#[error("[E{code:04}] {message}")]
pub struct SyntaxError {
    /// Numeric value of the `DiagCode` discriminant (e.g. 3 for E0003).
    pub code: u16,
    /// Human-readable message (rustc-style lower-case initial, no
    /// trailing period).
    pub message: String,
    /// Byte offset at which the error was emitted.  Typically the
    /// position of the offending token.
    pub offset: text_size::TextSize,
}

// ========================================================================
// Event stream
// ========================================================================

/// Events are emitted by the parser and consumed by
/// [`SyntaxTreeBuilder`](crate::builder::SyntaxTreeBuilder).
///
/// `Start` carries an optional forward-parent index implementing
/// [`Marker::precede`] (re-parenting a started node inside a new outer
/// node); [`Event::Abandoned`] retracts a speculatively-started node.
///
/// The event stream is flat: Start/Finish pairs define nesting, Token
/// events are emitted in source order alongside trivia that the builder
/// re-interleaves from the underlying token slice.
#[derive(Debug)]
pub(crate) enum Event {
    Start {
        kind: SyntaxKind,
        forward_parent: Option<u32>,
    },
    Finish,
    Token {
        kind: SyntaxKind,
    },
    Error {
        code: u16,
        msg: String,
        offset: text_size::TextSize,
    },
    Abandoned,
}

impl Event {
    /// Placeholder used by the builder when it swap-removes an event
    /// during forward-parent chain resolution.
    pub(crate) fn tombstone() -> Self {
        Event::Abandoned
    }
}

// ========================================================================
// Parser engine
// ========================================================================

/// Token-set bitmap used by `at_ts`/`recover_until`. Backed by an
/// 8×u128 array so we can represent every [`SyntaxKind`] value in
/// `0..1024` without collisions (the enum's high-water mark is
/// [`SyntaxKind::EOF`] = 769). Every construction is `const`-friendly so
/// grammar code can build sets at compile time.
#[derive(Copy, Clone, Debug)]
pub(crate) struct TokenSet([u128; TOKEN_SET_WORDS]);

const TOKEN_SET_WORDS: usize = 8;
#[allow(clippy::cast_possible_truncation)]
const TOKEN_SET_CAPACITY: u16 = (TOKEN_SET_WORDS as u16) * 128;

impl TokenSet {
    pub(crate) const EMPTY: TokenSet = TokenSet([0; TOKEN_SET_WORDS]);

    pub(crate) const fn new(kinds: &[SyntaxKind]) -> Self {
        let mut mask = [0u128; TOKEN_SET_WORDS];
        let mut i = 0;
        while i < kinds.len() {
            let raw = kinds[i] as u16;
            assert!(raw < TOKEN_SET_CAPACITY, "TokenSet: kind out of range");
            let word = (raw / 128) as usize;
            let bit = raw % 128;
            mask[word] |= 1u128 << bit;
            i += 1;
        }
        TokenSet(mask)
    }

    pub(crate) const fn union(self, other: TokenSet) -> TokenSet {
        let mut out = [0u128; TOKEN_SET_WORDS];
        let mut i = 0;
        while i < TOKEN_SET_WORDS {
            out[i] = self.0[i] | other.0[i];
            i += 1;
        }
        TokenSet(out)
    }

    pub(crate) const fn contains(self, kind: SyntaxKind) -> bool {
        let raw = kind as u16;
        if raw >= TOKEN_SET_CAPACITY {
            return false;
        }
        let word = (raw / 128) as usize;
        let bit = raw % 128;
        (self.0[word] & (1u128 << bit)) != 0
    }
}

impl Default for TokenSet {
    fn default() -> Self {
        TokenSet::EMPTY
    }
}

/// The recovering parser. Owns the token slice plus an in-flight event
/// buffer. A non-trivia cursor jumps over whitespace/comment tokens so
/// grammar code never sees them — the builder reassembles trivia when
/// consuming events.
pub(crate) struct Parser<'a> {
    tokens: &'a [LexToken],
    /// Position in `tokens` — the index of the next *significant* token.
    /// Trivia between significant tokens is implicitly skipped.
    pos: usize,
    events: Vec<Event>,
}

impl<'a> Parser<'a> {
    pub(crate) fn new(tokens: &'a [LexToken]) -> Self {
        Self {
            tokens,
            pos: skip_trivia(tokens, 0),
            events: Vec::new(),
        }
    }

    /// Current significant token kind, or `EOF` if exhausted.
    pub(crate) fn current(&self) -> SyntaxKind {
        self.tokens
            .get(self.pos)
            .map_or(SyntaxKind::EOF, |t| t.kind)
    }

    /// Look ahead `n` significant tokens. `nth(0) == current()`.
    #[allow(dead_code)]
    pub(crate) fn nth(&self, n: usize) -> SyntaxKind {
        let mut idx = self.pos;
        let mut remaining = n;
        loop {
            if idx >= self.tokens.len() {
                return SyntaxKind::EOF;
            }
            if self.tokens[idx].kind.is_trivia() {
                idx += 1;
                continue;
            }
            if remaining == 0 {
                return self.tokens[idx].kind;
            }
            remaining -= 1;
            idx += 1;
        }
    }

    pub(crate) fn at(&self, kind: SyntaxKind) -> bool {
        self.current() == kind
    }

    pub(crate) fn at_ts(&self, set: TokenSet) -> bool {
        set.contains(self.current())
    }

    /// Case-insensitive match against a contextual keyword spelled as an
    /// identifier. Cypher allows unreserved identifiers (e.g. `WITH` when
    /// scoped by context); for v1 we only need this for the composite
    /// operators `STARTS WITH` / `ENDS WITH`, which are handled via the
    /// already-tokenised `STARTS_KW` / `ENDS_KW` and thus do not need the
    /// hook. Left here as a single call site in case later beads do.
    #[allow(dead_code)]
    pub(crate) fn at_contextual(&self, ident: &str) -> bool {
        self.current() == SyntaxKind::IDENT
            && self
                .tokens
                .get(self.pos)
                .is_some_and(|t| t.text.eq_ignore_ascii_case(ident))
    }

    /// Byte offset of the current token (or end-of-input if exhausted).
    fn current_offset(&self) -> text_size::TextSize {
        self.tokens.get(self.pos).map_or_else(
            || {
                self.tokens
                    .last()
                    .map_or(text_size::TextSize::from(0), |t| t.range.end())
            },
            |t| t.range.start(),
        )
    }

    /// Consume whatever token is current (including EOF-no-op) and emit it
    /// as a Token event. Use sparingly — prefer `bump(kind)` when the kind
    /// is known for the debug assertion.
    pub(crate) fn bump_any(&mut self) {
        let kind = self.current();
        if kind == SyntaxKind::EOF {
            return;
        }
        self.events.push(Event::Token { kind });
        self.pos += 1;
        self.pos = skip_trivia(self.tokens, self.pos);
    }

    /// Consume the current token, asserting it has the given kind in debug.
    pub(crate) fn bump(&mut self, kind: SyntaxKind) {
        debug_assert!(
            self.at(kind),
            "bump: expected {kind:?}, got {:?}",
            self.current()
        );
        self.bump_any();
    }

    /// Consume `kind` if present; return whether it was.
    pub(crate) fn eat(&mut self, kind: SyntaxKind) -> bool {
        if self.at(kind) {
            self.bump(kind);
            true
        } else {
            false
        }
    }

    /// Record a diagnostic at the current position without consuming.
    ///
    /// `code` is the numeric discriminant of the corresponding `DiagCode`
    /// variant in `cyrs-diag` (e.g. `3` for `E0003`). Use the
    /// `syntax_codes` constants in this module for clarity.
    pub(crate) fn error_code(&mut self, code: u16, msg: impl Into<String>) {
        let offset = self.current_offset();
        self.events.push(Event::Error {
            code,
            msg: msg.into(),
            offset,
        });
    }

    /// Record a generic syntax error (E0001) at the current position.
    ///
    /// Prefer [`Parser::error_code`] with a specific code at every call
    /// site that maps to a known error class.
    pub(crate) fn error(&mut self, msg: impl Into<String>) {
        self.error_code(syntax_codes::GENERIC_SYNTAX_ERROR, msg);
    }

    /// Consume one token and wrap it in an ERROR node alongside a message.
    #[allow(dead_code)]
    pub(crate) fn err_and_bump(&mut self, msg: impl Into<String>) {
        let m = self.start();
        self.error(msg);
        self.bump_any();
        m.complete(self, SyntaxKind::ERROR);
    }

    /// Require `kind`. If present, consume it; otherwise emit a diagnostic
    /// at the expected position. The parser continues — the missing
    /// closer acts as a zero-width `ERROR` in the event stream
    /// (virtual-token insertion per spec §4.3).
    #[allow(dead_code)]
    pub(crate) fn expect(&mut self, kind: SyntaxKind) -> bool {
        if self.eat(kind) {
            true
        } else {
            self.error_code(syntax_codes::EXPECTED_TOKEN, format!("expected {kind:?}"));
            false
        }
    }

    /// Skip tokens until we hit one in `set`, an already-synchronising
    /// clause keyword, a `;`, or EOF. Skipped tokens are wrapped in a
    /// single `ERROR` node (spec §4.3 — synchronisation-set skip).
    pub(crate) fn recover_until(&mut self, set: TokenSet) {
        let stop = set.union(RECOVERY_STOP);
        if stop.contains(self.current()) || self.current() == SyntaxKind::EOF {
            return;
        }
        let m = self.start();
        while !stop.contains(self.current()) && self.current() != SyntaxKind::EOF {
            self.bump_any();
        }
        m.complete(self, SyntaxKind::ERROR);
    }

    /// Begin a new node. Returns a `Marker` that must be completed or
    /// abandoned before it is dropped.
    pub(crate) fn start(&mut self) -> Marker {
        let pos = self.events.len();
        self.events.push(Event::Start {
            kind: SyntaxKind::ERROR, // placeholder; replaced by complete()
            forward_parent: None,
        });
        Marker::new(pos)
    }

    /// Return the current token-cursor position. Used by the no-progress
    /// guard in `source_file` to detect loops where no token is consumed.
    pub(crate) fn position(&self) -> usize {
        self.pos
    }

    pub(crate) fn into_events(self) -> Vec<Event> {
        self.events
    }
}

// ========================================================================
// Syntax error code constants (spec §10.2, E0001–E0999)
// ========================================================================
//
// These numeric values MUST match the discriminants of the corresponding
// `DiagCode` enum variants in `cyrs-diag/src/codes.rs`.  `cyrs-syntax`
// cannot depend on `cyrs-diag` (§3.1), so the codes are mirrored here
// as plain `u16` constants.  CI enforces that each constant is actually
// used at a call site and that its value exists in the registry.

pub(crate) mod syntax_codes {
    /// E0001 — generic / unclassified syntax error.
    pub(crate) const GENERIC_SYNTAX_ERROR: u16 = 1;
    /// E0002 — unexpected token.
    pub(crate) const UNEXPECTED_TOKEN: u16 = 2;
    /// E0003 — expected `<token>`, found `<token>`.
    pub(crate) const EXPECTED_TOKEN: u16 = 3;
    /// E0004 — unclosed string literal.
    pub(crate) const UNCLOSED_STRING: u16 = 4;
    /// E0005 — unclosed block comment.
    pub(crate) const UNCLOSED_BLOCK_COMMENT: u16 = 5;
    /// E0006 — invalid numeric literal.
    pub(crate) const INVALID_NUMERIC_LITERAL: u16 = 6;
    /// E0007 — expected statement.
    pub(crate) const EXPECTED_STATEMENT: u16 = 7;
    /// E0008 — expected `;` or end of input.
    pub(crate) const EXPECTED_SEMICOLON_OR_EOF: u16 = 8;
    /// E0009 — expected `(` to start a node pattern.
    pub(crate) const EXPECTED_LPAREN_NODE: u16 = 9;
    /// E0010 — expected node pattern after relationship.
    pub(crate) const EXPECTED_NODE_AFTER_REL: u16 = 10;
    /// E0011 — expected `)` to close node pattern.
    pub(crate) const EXPECTED_RPAREN_NODE: u16 = 11;
    /// E0012 — expected `-` at relationship start.
    pub(crate) const EXPECTED_DASH_REL_START: u16 = 12;
    /// E0013 — expected `-` to close relationship.
    pub(crate) const EXPECTED_DASH_REL_CLOSE: u16 = 13;
    /// E0014 — expected `-` or `->` to close relationship.
    pub(crate) const EXPECTED_DASH_OR_ARROW: u16 = 14;
    /// E0015 — expected `]` to close relationship detail.
    pub(crate) const EXPECTED_RBRACK_REL: u16 = 15;
    /// E0016 — expected label after `:`.
    pub(crate) const EXPECTED_LABEL: u16 = 16;
    /// E0017 — expected relationship type after `:`.
    pub(crate) const EXPECTED_REL_TYPE: u16 = 17;
    /// E0018 — expected `}` to close property map.
    pub(crate) const EXPECTED_RBRACE_PROP: u16 = 18;
    /// E0019 — expected property key.
    pub(crate) const EXPECTED_PROP_KEY: u16 = 19;
    /// E0020 — expected `:` in property entry.
    pub(crate) const EXPECTED_COLON_PROP: u16 = 20;
    /// E0021 — expected expression for property value.
    pub(crate) const EXPECTED_PROP_VALUE: u16 = 21;
    /// E0022 — expected identifier.
    pub(crate) const EXPECTED_IDENT: u16 = 22;
    /// E0023 — expression nesting limit exceeded.
    pub(crate) const EXPR_NESTING_LIMIT: u16 = 23;
    /// E0024 — expected operand after unary operator.
    pub(crate) const EXPECTED_UNARY_OPERAND: u16 = 24;
    /// E0025 — expected `NULL` after `IS`.
    pub(crate) const EXPECTED_NULL_AFTER_IS: u16 = 25;
    /// E0026 — expected right-hand side of binary expression.
    pub(crate) const EXPECTED_BINOP_RHS: u16 = 26;
    /// E0027 — expected expression inside parentheses.
    pub(crate) const EXPECTED_EXPR_IN_PARENS: u16 = 27;
    /// E0028 — expected `)` to close expression.
    pub(crate) const EXPECTED_RPAREN_EXPR: u16 = 28;
    /// E0029 — expected `WITH` after `STARTS`.
    pub(crate) const EXPECTED_WITH_AFTER_STARTS: u16 = 29;
    /// E0030 — expected `WITH` after `ENDS`.
    pub(crate) const EXPECTED_WITH_AFTER_ENDS: u16 = 30;
    /// E0031 — expected property key after `.`.
    pub(crate) const EXPECTED_PROP_KEY_AFTER_DOT: u16 = 31;
    /// E0032 — expected index expression.
    pub(crate) const EXPECTED_INDEX_EXPR: u16 = 32;
    /// E0033 — expected `]` to close index expression.
    pub(crate) const EXPECTED_RBRACK_INDEX: u16 = 33;
    /// E0034 — expected `)` to close function call.
    pub(crate) const EXPECTED_RPAREN_CALL: u16 = 34;
    /// E0035 — expected function argument.
    pub(crate) const EXPECTED_CALL_ARG: u16 = 35;
    /// E0036 — expected expression in `RETURN` item.
    pub(crate) const EXPECTED_RETURN_EXPR: u16 = 36;
    /// E0037 — expected identifier after `AS`.
    pub(crate) const EXPECTED_IDENT_AFTER_AS: u16 = 37;
    /// E0038 — expected `BY` after `ORDER`.
    pub(crate) const EXPECTED_BY_AFTER_ORDER: u16 = 38;
    /// E0039 — expected expression in `ORDER BY`.
    pub(crate) const EXPECTED_ORDERBY_EXPR: u16 = 39;
    /// E0040 — expected expression after `SKIP`.
    pub(crate) const EXPECTED_SKIP_EXPR: u16 = 40;
    /// E0041 — expected expression after `LIMIT`.
    pub(crate) const EXPECTED_LIMIT_EXPR: u16 = 41;
    /// E0042 — expected `MATCH` after `OPTIONAL`.
    pub(crate) const EXPECTED_MATCH_AFTER_OPTIONAL: u16 = 42;
    /// E0043 — expected expression after `WHERE`.
    pub(crate) const EXPECTED_WHERE_EXPR: u16 = 43;
    /// E0044 — clause not yet implemented (deferred construct).
    /// E0044 — was the cy-nom-era "unimplemented clause" stub error
    /// emitted by `deferred_clause_stub`. cy-4mg landed CALL — the last
    /// previously-deferred clause — so the helper is gone. Code number
    /// retained to preserve the diagnostic registry's monotonic numbering.
    #[allow(dead_code)]
    pub(crate) const UNIMPLEMENTED_CLAUSE_RESERVED: u16 = 44;
    /// E0045 — expected a clause keyword.
    pub(crate) const EXPECTED_CLAUSE: u16 = 45;
    /// E0046 — invalid escape sequence in string literal.
    pub(crate) const INVALID_ESCAPE: u16 = 46;
    /// E0047 — expected expression in list literal.
    pub(crate) const EXPECTED_LIST_ELEM: u16 = 47;
    /// E0048 — expected `]` to close list literal.
    pub(crate) const EXPECTED_RBRACK_LIST: u16 = 48;
    /// E0049 — expected key in map literal.
    pub(crate) const EXPECTED_MAP_KEY: u16 = 49;
    /// E0050 — expected `:` in map entry.
    pub(crate) const EXPECTED_COLON_MAP: u16 = 50;
    /// E0051 — expected expression for map value.
    pub(crate) const EXPECTED_MAP_VALUE: u16 = 51;
    /// E0052 — expected `}` to close map literal.
    pub(crate) const EXPECTED_RBRACE_MAP: u16 = 52;
    /// E0053 — expected expression after `UNWIND`.
    pub(crate) const EXPECTED_UNWIND_EXPR: u16 = 53;
    /// E0054 — expected `AS` after `UNWIND` expression.
    pub(crate) const EXPECTED_AS_UNWIND: u16 = 54;
    /// E0055 — expected pattern after `CREATE`.
    pub(crate) const EXPECTED_CREATE_PATTERN: u16 = 55;
    /// E0056 — expected pattern after `MERGE`.
    pub(crate) const EXPECTED_MERGE_PATTERN: u16 = 56;
    /// E0057 — expected SET item (property assignment or label add).
    pub(crate) const EXPECTED_SET_ITEM: u16 = 57;
    /// E0058 — expected REMOVE item (property or label).
    pub(crate) const EXPECTED_REMOVE_ITEM: u16 = 58;
    /// E0059 — expected expression after `DELETE`.
    pub(crate) const EXPECTED_DELETE_EXPR: u16 = 59;
    /// E0060 — expected `DELETE` after `DETACH`.
    pub(crate) const EXPECTED_DELETE_AFTER_DETACH: u16 = 60;
    /// E0061 — expected `CREATE` or `MATCH` after `ON` in MERGE action.
    pub(crate) const EXPECTED_ON_ACTION: u16 = 61;
    /// E0062 — expected `]` to close variable-length hop.
    #[allow(dead_code)]
    pub(crate) const EXPECTED_RBRACK_HOP: u16 = 62;
    /// E0063 — expected `=` after path binder identifier.
    #[allow(dead_code)]
    pub(crate) const EXPECTED_EQ_PATH_BIND: u16 = 63;
    /// E0064 — expected `]` to close an indexing / slicing bracket.
    ///
    /// Distinct from `EXPECTED_RBRACK_INDEX` (E0033) which covers the
    /// legacy `SUBSCRIPT_EXPR` path: E0064 is emitted by the typed
    /// index / slice recovery introduced in cy-7s6.1.
    pub(crate) const UNCLOSED_INDEX_BRACKET: u16 = 64;
    /// E0065 — expected `(` after ANY / ALL / NONE / SINGLE keyword to
    /// begin the list-predicate arg list (cy-8x5).
    pub(crate) const EXPECTED_LPAREN_LIST_PREDICATE: u16 = 65;
    /// E0066 — expected `IN` between the list-predicate binder and its
    /// source expression (cy-8x5).
    pub(crate) const EXPECTED_IN_LIST_PREDICATE: u16 = 66;
    /// E0067 — expected `)` to close a list-predicate expression
    /// (cy-8x5).
    pub(crate) const EXPECTED_RPAREN_LIST_PREDICATE: u16 = 67;
    /// E0068 — expected `IN` in list comprehension.
    ///
    /// Emitted by the list-comprehension parse path when the first
    /// token inside `[` is an identifier but is not followed by the
    /// `IN` keyword — cy-5gh.
    pub(crate) const EXPECTED_IN_LIST_COMP: u16 = 68;
    /// E0069 — expected `|` or `]` in list comprehension.
    ///
    /// Emitted when a list comprehension's trailing segment is
    /// malformed — either a missing `|` before the projection
    /// expression or a missing `]` closer — cy-5gh.
    pub(crate) const EXPECTED_PIPE_OR_RBRACK_LIST_COMP: u16 = 69;
    /// E0070 — expected `THEN` in CASE arm.
    ///
    /// Emitted by the parser's `CASE` production when a `WHEN <value>`
    /// clause is not followed by the mandatory `THEN` keyword — cy-41u.
    pub(crate) const EXPECTED_THEN_CASE: u16 = 70;
    /// E0071 — expected `END` to close a CASE expression.
    ///
    /// Emitted by the parser's `CASE` production when the terminating
    /// `END` keyword is missing after the final arm / optional `ELSE`
    /// branch — cy-41u.
    pub(crate) const EXPECTED_END_CASE: u16 = 71;
    /// E0072 — expected `)` to close an `EXISTS(<pattern>)` pattern
    /// predicate (cy-lve, spec §6.1 / §19 row "Pattern predicates in
    /// expressions").
    pub(crate) const EXPECTED_RPAREN_EXISTS: u16 = 72;
    /// E0073 — expected procedure name after `CALL` (cy-4mg, spec §14 /
    /// §19 row "CALL <proc> YIELD ..."). The procedure-name production
    /// is `IDENT ('.' IDENT)*`; emitted when the token after `CALL` is
    /// not an identifier.
    pub(crate) const EXPECTED_PROCEDURE_NAME: u16 = 73;
    /// E0074 — expected `)` to close the argument list of a CALL
    /// invocation (cy-4mg, spec §14).
    pub(crate) const EXPECTED_RPAREN_CALL_ARGS: u16 = 74;
    /// E0075 — expected identifier in a YIELD item (cy-4mg, spec §14 /
    /// ungrammar `YieldItem = NameRef ('AS' NameDef)?`).
    pub(crate) const EXPECTED_YIELD_ITEM: u16 = 75;
    /// E0077 — expected `)` to close a `shortestPath` /
    /// `allShortestPaths` pattern function argument (cy-b5b, spec §6.4 /
    /// §19 row "shortest-path"). E0076 reserved for `CALL { ... }`
    /// block subquery (cy-4mg follow-up — block form deferred per §20 D1).
    pub(crate) const EXPECTED_RPAREN_SHORTEST_PATH: u16 = 77;
    /// E0078 — expected `}` to close a map projection (cy-01q, spec §6.1 /
    /// §19 row "Map projection"). Distinct from E0052 (map literal close)
    /// so tooling can tell projection from literal recovery apart.
    /// Renumbered from E0073 to avoid collision with cy-4mg.
    pub(crate) const EXPECTED_RBRACE_MAP_PROJECTION: u16 = 78;
    /// E0079 — expected property name or `*` after `.` in a map projection
    /// item (cy-01q). The `.` opens either `.NAME` (property selector) or
    /// `.*` (all-properties spread); anything else is an error.
    /// Renumbered from E0074.
    pub(crate) const EXPECTED_PROP_OR_STAR_AFTER_DOT_IN_PROJECTION: u16 = 79;
    /// E0080 — expected `:` in a map projection literal item (cy-01q).
    /// Distinct from E0050 (map literal `:`) for the same reason as
    /// E0078 vs E0052. Renumbered from E0075.
    pub(crate) const EXPECTED_COLON_MAP_PROJECTION: u16 = 80;
    /// E0081 — expected expression for map projection value (cy-01q).
    /// Renumbered from E0076.
    pub(crate) const EXPECTED_VALUE_MAP_PROJECTION: u16 = 81;
    /// E0082 — expected an item in map projection: `.name`, `key: expr`,
    /// `.*`, or `*` (cy-01q). Renumbered from E0077.
    pub(crate) const EXPECTED_MAP_PROJECTION_ITEM: u16 = 82;

    // ---- dialect gates (E4xxx, shared with cyrs-diag::codes) -----------
    // The `error_code` payload is the numeric part of a `DiagCode`
    // variant. Dialect-gate codes live in `crates/cyrs-diag/src/codes.rs`
    // and are referenced here as `u16` constants so the parser can emit
    // them without a cross-crate dependency.

    /// E4017 — `EXISTS { <subquery> }` block form is deferred per spec
    /// §9.3 / §19 / §20 D1 (cy-lve, tranche A). Registered in
    /// `cyrs-diag::codes` (`DiagCode::E4017`, `exists_subquery`
    /// dialect gate). Emitted by the parser when it encounters
    /// `EXISTS {` in an atom position.
    pub(crate) const EXISTS_BLOCK_DEFERRED: u16 = 4017;
}

/// Advance `idx` past any leading trivia tokens.
fn skip_trivia(tokens: &[LexToken], mut idx: usize) -> usize {
    while idx < tokens.len() && tokens[idx].kind.is_trivia() {
        idx += 1;
    }
    idx
}

/// Clause-start keywords + statement terminators that always end recovery.
/// Any clause recovery set is implicitly unioned with this.
pub(crate) const RECOVERY_STOP: TokenSet = TokenSet::new(&[
    SyntaxKind::MATCH_KW,
    SyntaxKind::OPTIONAL_KW,
    SyntaxKind::WHERE_KW,
    SyntaxKind::WITH_KW,
    SyntaxKind::RETURN_KW,
    SyntaxKind::CREATE_KW,
    SyntaxKind::MERGE_KW,
    SyntaxKind::SET_KW,
    SyntaxKind::REMOVE_KW,
    SyntaxKind::DELETE_KW,
    SyntaxKind::DETACH_KW,
    SyntaxKind::UNWIND_KW,
    SyntaxKind::CALL_KW,
    SyntaxKind::UNION_KW,
    SyntaxKind::SEMI,
]);

// ========================================================================
// Marker / CompletedMarker (drop-bomb protected)
// ========================================================================

/// A pending node-open. Must be either `complete`d or `abandon`ed before
/// it is dropped. The [`DropBomb`] panics in debug builds if dropped.
#[must_use = "Marker must be completed or abandoned"]
pub(crate) struct Marker {
    pos: u32,
    bomb: DropBomb,
}

impl Marker {
    fn new(pos: usize) -> Self {
        Self {
            pos: u32::try_from(pos).expect("event index fits u32"),
            bomb: DropBomb::new("Marker must be completed or abandoned"),
        }
    }

    /// Close the node with the given kind, yielding a `CompletedMarker`
    /// that can be used for Pratt-style re-parenting via [`CompletedMarker::precede`].
    pub(crate) fn complete(mut self, p: &mut Parser<'_>, kind: SyntaxKind) -> CompletedMarker {
        self.bomb.defuse();
        let idx = self.pos as usize;
        match &mut p.events[idx] {
            Event::Start { kind: slot, .. } => *slot = kind,
            _ => unreachable!("marker index does not point to a Start event"),
        }
        p.events.push(Event::Finish);
        CompletedMarker {
            pos: self.pos,
            kind,
        }
    }

    /// Drop the speculative node. If no events were emitted after the
    /// marker's Start, we truncate; otherwise we tombstone the Start.
    #[allow(dead_code)]
    pub(crate) fn abandon(mut self, p: &mut Parser<'_>) {
        self.bomb.defuse();
        let idx = self.pos as usize;
        if idx + 1 == p.events.len() {
            p.events.pop();
        } else {
            p.events[idx] = Event::Abandoned;
        }
    }
}

/// A completed node handle. Can be retroactively re-parented via
/// [`CompletedMarker::precede`] — central to Pratt expression parsing.
#[derive(Debug, Copy, Clone)]
pub(crate) struct CompletedMarker {
    pos: u32,
    #[allow(dead_code)]
    kind: SyntaxKind,
}

impl CompletedMarker {
    /// Open a new outer node that will re-parent this completed node. The
    /// returned `Marker` must be completed; the outer node's Start/Finish
    /// will surround this node's Start/Finish in the emitted tree.
    pub(crate) fn precede(self, p: &mut Parser<'_>) -> Marker {
        let new_marker = p.start();
        // Link this completed Start's forward_parent to the new marker.
        let old_idx = self.pos as usize;
        let new_idx = new_marker.pos as usize;
        let delta = u32::try_from(new_idx - old_idx).expect("forward-parent delta fits u32");
        match &mut p.events[old_idx] {
            Event::Start { forward_parent, .. } => {
                debug_assert!(forward_parent.is_none(), "forward_parent already set");
                *forward_parent = Some(delta);
            }
            _ => unreachable!("CompletedMarker pos does not point to a Start"),
        }
        new_marker
    }
}

#[cfg(test)]
mod tests {
    use super::parse;
    use crate::SyntaxKind;

    #[test]
    fn parse_empty_is_lossless() {
        let p = parse("");
        assert_eq!(p.syntax().to_string(), "");
    }

    #[test]
    fn parse_simple_is_lossless() {
        let src = "MATCH (n) RETURN n";
        let p = parse(src);
        assert_eq!(p.syntax().to_string(), src);
    }

    #[test]
    fn parse_with_comments_is_lossless() {
        let src = "// leading\nMATCH (n) /* inline */ RETURN n";
        let p = parse(src);
        assert_eq!(p.syntax().to_string(), src);
    }

    #[test]
    fn root_kind_is_source_file() {
        let p = parse("MATCH (n) RETURN n");
        assert_eq!(p.syntax().kind(), SyntaxKind::SOURCE_FILE);
    }

    use proptest::prelude::*;

    proptest! {
        /// Property P17.3.2 — lossless CST over arbitrary UTF-8 input.
        #[test]
        fn lossless_on_arbitrary_utf8(s in ".*") {
            let p = parse(&s);
            prop_assert_eq!(p.syntax().to_string(), s);
        }

        /// Property P17.3.1 — parser totality: no panic, bounded time.
        #[test]
        fn parser_is_total_on_random_utf8(s in ".*") {
            let _ = parse(&s);
        }
    }
}