selene-db-gql 1.3.0

ISO/IEC 39075:2024 GQL parser, planner, optimizer, and executor for selene-db.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
//! Cheap parser input guards that run before pest recursive descent.

use crate::{SourceSpan, error::ParserError};

/// Maximum syntactic nesting depth admitted by the parser.
///
/// This bounds pest recursion on hostile malformed expressions while leaving
/// ordinary query, list, record, and subquery nesting comfortably below the cap.
///
/// **Load-bearing safety invariant — do not raise without re-checking the stack
/// budget.** This cap also bounds *subquery* nesting (`EXISTS`/`COUNT`/`VALUE`/
/// `CALL { … }` each open a counted `{`). The post-build expression-depth guard
/// (`super::depth`) resets its depth counter at each subquery boundary, so the
/// recursive Flagger walk's worst-case native-stack depth is the *product*
/// `MAX_NESTING_DEPTH × depth::MAX_EXPR_DEPTH` (≈ 64 × 256 ≈ 16 k frames) — a
/// deeply nested subquery whose every level carries a max-depth expression. That
/// fits only because `parse` runs the whole descent + build + Flagger on the
/// 32 MB `PARSE_STACK_SEGMENT` (`parser/mod.rs`). Raising this cap
/// (or `MAX_EXPR_DEPTH`, or shrinking the segment) must keep
/// `MAX_NESTING_DEPTH × MAX_EXPR_DEPTH × per_frame_bytes` safely under the
/// segment, or the non-unwindable Flagger overflow re-opens with no guard to
/// catch it. See `tests/parser_expr_depth.rs::nested_subqueries_with_deep_folds_do_not_crash`.
pub(crate) const MAX_NESTING_DEPTH: u32 = 64;

/// Maximum simultaneously-open `[` list nesting depth admitted in a single
/// statement.
///
/// pest is not packrat-memoized, so a run of unclosed list-literal openers can
/// still drive expensive nested expression attempts before the parser can
/// reject the input. Parsing precedes execution, so an execution deadline cannot
/// interrupt the blow-up; the only safe place to stop it is before recursive
/// descent begins.
///
/// This caps the *depth* of simultaneously-open `[`, **not** a total opener
/// count, because depth is the actual blow-up driver. A balanced, promptly
/// closed bracket never nests: an edge pattern `-[:E]->` and a flat list
/// `[1, 2, 3]` each return to `[`-depth 0 before the next opener, so a path of
/// arbitrarily many hops or an arbitrarily wide list stays at `[`-depth 1 and
/// is always admitted — only genuinely nested `[` accrue depth. (A total-count
/// cap, by contrast, would reject a legitimate ten-hop fixed path purely for
/// its opener count.) The cap of 32 is an order of magnitude above the deepest
/// legitimate `[` nesting anywhere in the workspace (3, a `[[[1]]]` literal)
/// and comfortably below the ~57-deep empirical blow-up point. `(` and `{`
/// nesting is not a demonstrated backtracking vector (the fuzz corpus contains
/// only `[`) and is bounded by [`MAX_NESTING_DEPTH`] alone.
pub(crate) const MAX_LIST_NESTING_DEPTH: u32 = 32;

/// Maximum zero-delimiter recursive-descent depth admitted by the parser.
///
/// pest's recursive descent recurses one stack frame per nesting level. Three
/// `grammar.pest` rules recurse back toward `expr` with **no** guarded
/// delimiter (`(`/`[`/`{`) in between, so [`MAX_NESTING_DEPTH`] (which only
/// counts delimiters) cannot bound them:
///
/// - `unary = { sign_op ~ unary | postfix }` (`grammar.pest:380`) — a *run* of
///   leading unary `+`/`-` signs.
/// - `not_expr = { not_kw ~ not_expr | is_expr }` (`grammar.pest:324`) — a *run*
///   of `NOT` keywords.
/// - `case_expr` ↔ `expr` (`grammar.pest:447`) — *nested* `CASE … END`
///   expressions: each level re-enters the full precedence cascade for its
///   `WHEN`/`THEN`/`ELSE` operand exprs. `CASE` is keyword-delimited, and a byte
///   scan cannot soundly recognize its `END` *closer* (see below), so this is
///   tracked by a **monotone** `case_depth` — a sound over-approximation of
///   open-`CASE` pressure that never decrements — rather than a balanced counter.
///
/// `NOT`/`CASE`/`END` are reserved keywords (`grammar.pest` `keyword` set), so
/// they cannot be *bare* identifiers — but `prop_ident` (`grammar.pest:602`) has
/// no keyword guard, so they appear as identifiers in property names (`n.END`),
/// map/record keys (`{END: …}`), aliases (`AS END`), `YIELD` columns
/// (`YIELD a, END`), and parameters (`$END`). This is why `case_depth` is
/// **monotone**, with an asymmetry between the opener and the closer:
///
/// - A `CASE` **opener** is recognized soundly. It is counted only when *not* in
///   an identifier position (`.`/`$` lookbehind, an `AS`/`YIELD` predecessor, or
///   a `:` follower — see [`validate`]), and the grammar never places a real
///   `case_expr` in any of those slots, so skipping an identifier-`CASE` can
///   never *under*-count a real frame.
/// - The `END` **closer** cannot be told apart from an identifier `END` in every
///   position — a comma-tail `YIELD` column (`CALL p() YIELD a, END`) is a bare
///   `prop_ident` that a lookbehind/lookahead cannot distinguish from the
///   keyword. So the guard **never decrements** on `END`. Trusting `END` as a
///   closer would let a phantom identifier-`END` cancel a real open `CASE` and
///   reopen the overflow (a confirmed bypass).
///
/// The cost of monotonicity is a *documented, conformant* limit: a single
/// statement's combined `CASE`-count-plus-nesting-pressure must stay ≤
/// [`MAX_RECURSION_DEPTH`], so a pathological statement with hundreds of (even
/// non-nested) `CASE` expressions is rejected with the program-limit GQLSTATUS.
/// Realistic queries (a handful of `CASE`s, shallow nesting) are unaffected — the
/// deepest legitimate nesting anywhere in the workspace is 3.
///
/// (The `type_name` `LIST<…>` recursion is *also* zero-delimiter, but `LIST` is
/// **not** reserved — it is a legal bare variable and `LIST < x` is a comparison
/// — so a byte scan cannot soundly distinguish the type opener from a variable.
/// `LIST<…>` is instead bounded at AST-build time by `build_type_name_with_depth`
/// (`builders::expr`, depth 64); only the pre-build pest-descent tail of a very
/// large input remains, absorbed by the `stacker` parse backstop in
/// `parser/mod.rs`.)
///
/// A long enough zero-delimiter run/nesting overflows the native stack inside
/// pest. A Rust stack overflow is **non-unwindable** (`catch_unwind` cannot trap
/// it), so a small hostile string would hard-kill the host process embedding
/// selene-db. This pre-pest cap rejects such inputs deterministically before
/// recursive descent begins, mapped to GQLSTATUS `5GQL1`.
///
/// **This is a single COMBINED ceiling on total recursion pressure**, not a
/// per-counter cap. The native stack depth at the deepest point is the *sum* of
/// every simultaneously-open recursion frame — open delimiters (`depth`) *plus*
/// the current unary-sign chain *plus* the `NOT` chain *plus* the open `CASE`
/// frames — because they nest as operands of one another (`(` `-` `CASE WHEN (` …
/// each frame stays open until its operand reduces). Capping each counter
/// independently would admit their *product* (e.g. 64 nested `(` each carrying a
/// 255-deep sign chain ≈ 16 k frames) while every individual counter stayed under
/// its cap — a real stack-overflow vector. So the guard bounds the running SUM
/// `depth + sign_run + not_run + case_depth`, maintained as a sound upper bound
/// on the live native-stack depth at every position
/// (`depth + sign_run + not_run + case_depth ≥ true_open_frames`), so the cap is
/// detected at or before the deepest point. The counters differ in how they
/// reduce:
///
/// - The *run* counters (`sign_run`, `not_run`) reset at a value/closer (where
///   the active chain reduces) but never at an opener.
/// - `depth` is a balanced delimiter counter (decrements at `)`/`}`/`]`).
/// - `case_depth` is **monotone** (never decrements — see above). A real `CASE`
///   opener folds the wrapping sign/`NOT` run into it
///   (`case_depth += 1 + sign_run + not_run`) before resetting those runs, so the
///   run frames that stay open across the `CASE` operands are not lost. This is
///   what bounds `NOT … NOT CASE … THEN NOT … NOT CASE …`, whose true descent
///   depth is `levels × (run + 1)` — folding keeps the sum ≥ that depth.
///
/// The delimiter caps [`MAX_NESTING_DEPTH`] (64) and [`MAX_LIST_NESTING_DEPTH`]
/// (32) remain as tighter, more precise sub-bounds.
///
/// The cap mirrors `ANALYZER_MAX_DEPTH = 256` (`analyze/bind/mod.rs:29`) and is
/// a deliberate safety floor, not a tuning knob: it sits ~4-8x below the
/// smallest realistic stack's overflow point (a 2 MB nextest thread overflows
/// the sign chain at ~1000-2000) so regression tests can replay an over-cap
/// input without crashing the test runner, while the deepest legitimate nesting
/// of any kind anywhere in the workspace is 3.
pub(crate) const MAX_RECURSION_DEPTH: u32 = 256;

pub(super) fn validate(source: &str) -> Result<(), ParserError> {
    let bytes = source.as_bytes();
    // Index of the final `'` in the input. A single-quoted string treats `\'`
    // as an escaped quote ONLY when a later `'` exists (pest `escaped_quote`);
    // when the `'` after a `\` is the last quote, the `\` is a dangling escape
    // and the `'` closes the string (pest `dangling_escape`). Mirroring this is
    // load-bearing: a blanket "`\` escapes the next byte" rule lets `RETURN '\'`
    // skip past the real closing quote to EOF, hiding a hostile `[` run that
    // pest still closes the string before and parses — a parser-time DoS bypass.
    let last_single_quote = bytes.iter().rposition(|byte| *byte == b'\'');
    let last_double_quote = bytes.iter().rposition(|byte| *byte == b'"');
    let last_backtick = bytes.iter().rposition(|byte| *byte == b'`');
    let mut index = 0;
    let mut depth = 0_u32;
    let mut list_depth = 0_u32;
    // Recursion-pressure counters (see `MAX_RECURSION_DEPTH`). Their SUM with
    // `depth` is the bounded quantity: it tracks the native stack depth at the
    // current position. pest treats comments as whitespace, so a comment between
    // two signs does NOT break the unary run (`- /* c */ -` is a 2-deep `unary`).
    // The *run* counters (`sign_run`, `not_run`) reset at a value (a primary —
    // string/backtick/number/other word) or a closer; the *balanced* counters
    // (`depth`, `case_depth`) decrement only at their matching closer (delimiter
    // closer, keyword `END`). An OPENER never resets a run, so a chain frozen open
    // across a delimiter keeps counting (otherwise the guard would admit the
    // product of the per-counter caps — see `MAX_RECURSION_DEPTH`).
    // Whitespace/comment arms `continue` without touching any counter OR the
    // `prev_*` lookbehind state, so adjacency survives an intervening comment.
    let mut sign_run = 0_u32;
    let mut not_run = 0_u32;
    // Monotone nested-`CASE` pressure (see `MAX_RECURSION_DEPTH`). A real keyword
    // `CASE` opener adds 1 plus the sign/`NOT` run that wraps it; it NEVER
    // decrements (a byte scan cannot soundly tell a keyword `END` closer from an
    // identifier `END`, e.g. a `YIELD` column). Counted only when the `CASE` word
    // is not in an identifier position — see the word arm.
    let mut case_depth = 0_u32;
    // Lookbehind for the word classifier. `prev_sig_byte` is the last
    // *significant* (non-whitespace, non-comment) byte; only `.`/`$` matter (a
    // word right after them is a `prop_ident`/`param_ref` identifier, never the
    // keyword). `prev_word` is the last significant *keyword word* that admits an
    // identifier after it (`AS <alias>`, `YIELD <item>`); a word right after one
    // of those is an identifier. Both are left UNCHANGED by whitespace/comment.
    let mut prev_sig_byte: Option<u8> = None;
    let mut prev_word = PrevWord::Other;

    while index < bytes.len() {
        match bytes[index] {
            // Quoted string/identifier spans are primaries for guard purposes:
            // they reset the unary and `NOT` runs (a primary terminates a
            // leading-sign / `NOT` chain), then the scan resumes after the
            // closing quote.
            b'@' if next_is(bytes, index, b'\'') => {
                sign_run = 0;
                not_run = 0;
                prev_word = PrevWord::Other;
                prev_sig_byte = Some(b'\'');
                index = skip_no_escape_quoted(bytes, index + 2, b'\'');
            }
            b'@' if next_is(bytes, index, b'"') => {
                sign_run = 0;
                not_run = 0;
                prev_word = PrevWord::Other;
                prev_sig_byte = Some(b'"');
                index = skip_no_escape_quoted(bytes, index + 2, b'"');
            }
            b'@' if next_is(bytes, index, b'`') => {
                sign_run = 0;
                not_run = 0;
                prev_word = PrevWord::Other;
                prev_sig_byte = Some(b'`');
                index = skip_no_escape_quoted(bytes, index + 2, b'`');
            }
            b'\'' => {
                sign_run = 0;
                not_run = 0;
                prev_word = PrevWord::Other;
                prev_sig_byte = Some(b'\'');
                index = skip_single_quoted(bytes, index + 1, last_single_quote);
            }
            b'"' => {
                sign_run = 0;
                not_run = 0;
                prev_word = PrevWord::Other;
                prev_sig_byte = Some(b'"');
                index = skip_double_quoted(bytes, index + 1, last_double_quote);
            }
            b'`' => {
                sign_run = 0;
                not_run = 0;
                prev_word = PrevWord::Other;
                prev_sig_byte = Some(b'`');
                index = skip_backtick_quoted(bytes, index + 1, last_backtick);
            }
            // Comments are whitespace to pest, so they do NOT reset any run or
            // the `prev_*` lookbehind: `- // c\n -` is still a 2-deep unary chain,
            // and `n ./* c */ END` still sees `.` before `END`. Skip the span and
            // continue without touching state. `--` is two unary minus `sign_op`
            // (`grammar.pest:642` defines `//` as the only line comment), so it is
            // handled by the `b'-'` arm, NOT skipped here.
            b'/' if next_is(bytes, index, b'/') => {
                index = skip_line_comment(bytes, index + 2);
                continue;
            }
            b'/' if next_is(bytes, index, b'*') => {
                index = skip_block_comment(bytes, index + 2);
                index += 1;
                continue;
            }
            // An opener does NOT reset the sign / `NOT` runs: the delimiter
            // expression is the operand of any enclosing unary chain, which
            // stays open and must keep counting toward the combined cap.
            b'(' | b'{' => {
                depth += 1;
                prev_word = PrevWord::Other;
                prev_sig_byte = Some(bytes[index]);
                if depth > MAX_NESTING_DEPTH {
                    return Err(ParserError::NestingLimitExceeded {
                        limit: MAX_NESTING_DEPTH,
                        span: point_span(index),
                    });
                }
                if exceeds_recursion_budget(depth, sign_run, not_run, case_depth) {
                    return Err(ParserError::ComplexityLimitExceeded {
                        limit: MAX_RECURSION_DEPTH,
                        span: point_span(index),
                    });
                }
            }
            b'[' => {
                // `[` is the demonstrated super-linear backtracking vector, so
                // it carries the tighter dedicated depth cap on top of the
                // shared nesting cap. Check the tighter cap first so a deeply
                // nested `[` run is reported as the more precise complexity
                // violation rather than a generic nesting violation. Like the
                // other openers it does NOT reset the sign / `NOT` runs.
                list_depth += 1;
                prev_word = PrevWord::Other;
                prev_sig_byte = Some(b'[');
                if list_depth > MAX_LIST_NESTING_DEPTH {
                    return Err(ParserError::ComplexityLimitExceeded {
                        limit: MAX_LIST_NESTING_DEPTH,
                        span: point_span(index),
                    });
                }
                depth += 1;
                if depth > MAX_NESTING_DEPTH {
                    return Err(ParserError::NestingLimitExceeded {
                        limit: MAX_NESTING_DEPTH,
                        span: point_span(index),
                    });
                }
                if exceeds_recursion_budget(depth, sign_run, not_run, case_depth) {
                    return Err(ParserError::ComplexityLimitExceeded {
                        limit: MAX_RECURSION_DEPTH,
                        span: point_span(index),
                    });
                }
            }
            // A closer reduces the delimiter expression, so the unary / `NOT`
            // chain that wrapped it (if any) is now complete: reset both runs.
            // `case_depth` is NOT touched — a `)` closes a paren, not a `CASE`.
            b')' | b'}' => {
                depth = depth.saturating_sub(1);
                sign_run = 0;
                not_run = 0;
                prev_word = PrevWord::Other;
                prev_sig_byte = Some(bytes[index]);
            }
            b']' => {
                depth = depth.saturating_sub(1);
                list_depth = list_depth.saturating_sub(1);
                sign_run = 0;
                not_run = 0;
                prev_word = PrevWord::Other;
                prev_sig_byte = Some(b']');
            }
            // A leading unary `+`/`-` opens one more `unary` frame. `--` is two
            // signs (not a comment); `->`/`-[r]-` and binary `a-b-c` never run
            // beyond ~2-3 before a value resets the run. It is an opener, so it
            // does not reset `not_run`; the combined budget bounds the depth.
            b'+' | b'-' => {
                sign_run += 1;
                prev_word = PrevWord::Other;
                prev_sig_byte = Some(bytes[index]);
                if exceeds_recursion_budget(depth, sign_run, not_run, case_depth) {
                    return Err(ParserError::ComplexityLimitExceeded {
                        limit: MAX_RECURSION_DEPTH,
                        span: point_span(index),
                    });
                }
            }
            // Whitespace is transparent to all counters and lookbehind (pest
            // skips it), so it leaves every counter and `prev_*` UNCHANGED.
            b' ' | b'\t' | b'\r' | b'\n' => {}
            // An identifier-start byte begins a whole word. UTF-8 lead/continuation
            // bytes (>= 0x80) route here too so a Unicode identifier (`éCASE`) is
            // consumed whole and never mis-segments its ASCII tail as a keyword.
            byte if is_word_byte_start(byte) => {
                let word_end = scan_word_chars(source, index);
                if word_end == index {
                    // A `>= 0x80` lead byte that is not an identifier char
                    // (Unicode punctuation / space): a separator/primary. Advance
                    // one whole char to keep `index` on a char boundary.
                    sign_run = 0;
                    not_run = 0;
                    prev_word = PrevWord::Other;
                    prev_sig_byte = Some(byte);
                    index += char_len_at(source, index);
                    continue;
                }
                // A keyword in any of these contexts is a `prop_ident`/`param_ref`
                // identifier, not the keyword. The lookbehind (`.`/`$`, AS/YIELD)
                // and `:`/`::` lookahead skip whitespace+comments, mirroring pest's
                // non-atomic `~`.
                let in_ident_pos = matches!(prev_sig_byte, Some(b'.') | Some(b'$'))
                    || matches!(prev_word, PrevWord::As | PrevWord::Yield)
                    || next_sig_is_colon(bytes, word_end);
                let class = classify_word(&source[index..word_end]);
                match class {
                    WordClass::Not if !in_ident_pos => {
                        not_run += 1;
                        if exceeds_recursion_budget(depth, sign_run, not_run, case_depth) {
                            return Err(ParserError::ComplexityLimitExceeded {
                                limit: MAX_RECURSION_DEPTH,
                                span: point_span(index),
                            });
                        }
                    }
                    WordClass::Case if !in_ident_pos => {
                        // A real `CASE` opener. `case_depth` is a *monotone* sound
                        // over-approximation of open-`CASE` stack pressure: it never
                        // decrements (a byte scan cannot reliably tell a keyword
                        // `END` closer from an identifier `END` — e.g. a `YIELD`
                        // column — so trusting `END` as a closer is unsound). Fold
                        // the sign/`NOT` run that *wraps* this `CASE` (those frames
                        // stay open across its operands) into `case_depth`, then
                        // reset the runs so they are not double-counted. See
                        // `MAX_RECURSION_DEPTH`.
                        case_depth = case_depth
                            .saturating_add(1)
                            .saturating_add(sign_run)
                            .saturating_add(not_run);
                        sign_run = 0;
                        not_run = 0;
                        if exceeds_recursion_budget(depth, sign_run, not_run, case_depth) {
                            return Err(ParserError::ComplexityLimitExceeded {
                                limit: MAX_RECURSION_DEPTH,
                                span: point_span(index),
                            });
                        }
                    }
                    // Every other word — a value/primary, an identifier-positioned
                    // `CASE`, or any `END`/`NOT`/`AS`/`YIELD` (`AS`/`YIELD` act via
                    // the `prev_word` lookbehind below, not here) — terminates the
                    // active sign/`NOT` run but never decrements the monotone
                    // `case_depth`. In particular a keyword `END` is treated exactly
                    // like an ordinary word: it is NOT a trusted closer.
                    _ => {
                        sign_run = 0;
                        not_run = 0;
                    }
                }
                // Record the lookback for the NEXT word: only a genuine (keyword,
                // not identifier-positioned) AS/YIELD makes the following word an
                // identifier.
                prev_word = if in_ident_pos {
                    PrevWord::Other
                } else {
                    match class {
                        WordClass::As => PrevWord::As,
                        WordClass::Yield => PrevWord::Yield,
                        _ => PrevWord::Other,
                    }
                };
                // The word itself is now the predecessor — a non-`.`/`$` byte.
                prev_sig_byte = Some(b'w');
                index = word_end;
                continue;
            }
            // Any other byte (operators `.`, `,`, `*`, `<`, `>`, `:`, `$`, digits,
            // etc.) is a primary or separator: it reduces the active sign / `NOT`
            // chain. `prev_sig_byte` records it so a following word can see a
            // leading `.`/`$`.
            other => {
                sign_run = 0;
                not_run = 0;
                prev_word = PrevWord::Other;
                prev_sig_byte = Some(other);
            }
        }
        index += 1;
    }

    Ok(())
}

/// Whether the combined recursion pressure exceeds [`MAX_RECURSION_DEPTH`].
///
/// The native pest stack depth at any position is the sum of every open
/// recursion frame: open delimiters (`depth`) plus the active unary-sign chain
/// (`sign_run`) plus the `NOT` chain (`not_run`) plus the open nested-`CASE`
/// frames (`case_depth`). Bounding the SUM — not each counter independently — is
/// what prevents the product-depth vector (e.g. nested `(` each carrying a long
/// sign chain, or nested `CASE` each carrying a sign run). `saturating_add`
/// guards against overflow even though each addend is checked at +1 increments.
fn exceeds_recursion_budget(depth: u32, sign_run: u32, not_run: u32, case_depth: u32) -> bool {
    depth
        .saturating_add(sign_run)
        .saturating_add(not_run)
        .saturating_add(case_depth)
        > MAX_RECURSION_DEPTH
}

fn next_is(bytes: &[u8], index: usize, expected: u8) -> bool {
    bytes.get(index + 1).is_some_and(|value| *value == expected)
}

/// Keyword classification of a complete identifier word.
///
/// Only the keywords the guard reacts to are distinguished; everything else
/// (including all non-keyword identifiers) is [`WordClass::Other`].
enum WordClass {
    /// `NOT` — a unary-run opener.
    Not,
    /// `CASE` — opens a (monotone) nested-`CASE` frame.
    Case,
    /// `END` — the nested-`CASE` closer keyword. **Deliberately treated as an
    /// ordinary word** (no `case_depth` decrement): a byte scan cannot reliably
    /// tell a keyword `END` from an identifier `END` (a `YIELD` column, property
    /// name, or map key), so `case_depth` is a monotone over-approximation that
    /// never trusts `END` as a closer. See [`MAX_RECURSION_DEPTH`].
    End,
    /// `AS` — the following `prop_ident` is an alias identifier.
    As,
    /// `YIELD` — the following `prop_ident` is a yield-column identifier.
    Yield,
    /// Any other word.
    Other,
}

/// The last significant keyword word that makes the *next* word an identifier.
enum PrevWord {
    /// Last significant word was the `AS` keyword.
    As,
    /// Last significant word was the `YIELD` keyword.
    Yield,
    /// Anything else.
    Other,
}

/// A byte that may begin a GQL identifier word: ASCII `[A-Za-z_]` or any UTF-8
/// lead byte (`>= 0x80`).
///
/// `grammar.pest`'s `ident`/`prop_ident` admit Unicode `LETTER`, so a Unicode
/// identifier like `éCASE` must be consumed as one word — otherwise an ASCII-only
/// scan would split it and mis-read the trailing `CASE` as the keyword. Routing
/// every `>= 0x80` lead byte into the word arm (which then scans whole `char`s)
/// makes the segmentation match pest.
fn is_word_byte_start(byte: u8) -> bool {
    byte.is_ascii_alphabetic() || byte == b'_' || byte >= 0x80
}

/// Byte length of the `char` beginning at `index` (≥ 1; defaults to 1 at EOF).
fn char_len_at(source: &str, index: usize) -> usize {
    source[index..].chars().next().map_or(1, char::len_utf8)
}

/// Consume one whole Unicode identifier word starting at `start` and return the
/// byte index just past it (`== start` if the char at `start` is not an
/// identifier char).
///
/// Word boundaries match the grammar's `!(LETTER | NUMBER | "_")` keyword guard
/// over Unicode `char`s, so `NOTE`/`éCASE` are single words and never match a
/// keyword. Operates on `char`s (not bytes) so the returned index always lands on
/// a UTF-8 char boundary.
fn scan_word_chars(source: &str, start: usize) -> usize {
    let mut end = start;
    for (offset, ch) in source[start..].char_indices() {
        if ch.is_alphanumeric() || ch == '_' {
            end = start + offset + ch.len_utf8();
        } else {
            break;
        }
    }
    end
}

/// Classify a complete identifier word (ASCII-case-insensitively, matching the
/// `^"…"` case-insensitive keyword rules).
fn classify_word(word: &str) -> WordClass {
    if word.eq_ignore_ascii_case("NOT") {
        WordClass::Not
    } else if word.eq_ignore_ascii_case("CASE") {
        WordClass::Case
    } else if word.eq_ignore_ascii_case("END") {
        WordClass::End
    } else if word.eq_ignore_ascii_case("AS") {
        WordClass::As
    } else if word.eq_ignore_ascii_case("YIELD") {
        WordClass::Yield
    } else {
        WordClass::Other
    }
}

/// Whether the first significant (non-whitespace, non-comment) byte at or after
/// `from` is a `:` (covering both the `:` of a record/property/map key and the
/// `::` of a typed record field — `::` starts with `:`).
///
/// Skips whitespace and both comment forms because pest's non-atomic `~` allows
/// them between a `prop_ident` key and its `:` (`{ CASE /* */ : 1 }`). A word
/// followed by `:`/`::` is a key identifier, never a real `CASE` opener (no
/// grammar production places `:` after a `case_expr`). (`END` is never counted
/// regardless of position, so this lookahead is only load-bearing for `CASE`.)
fn next_sig_is_colon(bytes: &[u8], from: usize) -> bool {
    let mut index = from;
    while index < bytes.len() {
        match bytes[index] {
            b' ' | b'\t' | b'\r' | b'\n' => index += 1,
            b'/' if next_is(bytes, index, b'/') => index = skip_line_comment(bytes, index + 2),
            b'/' if next_is(bytes, index, b'*') => index = skip_block_comment(bytes, index + 2) + 1,
            b':' => return true,
            _ => return false,
        }
    }
    false
}

fn skip_single_quoted(bytes: &[u8], mut index: usize, last_quote: Option<usize>) -> usize {
    while index < bytes.len() {
        match bytes[index] {
            // `\'` where the `'` is the final quote in the input is a *dangling*
            // escape (pest `dangling_escape`): the `\` is literal and the `'`
            // closes the string. Return the `'` position so the scan resumes
            // after it and still counts any following brackets — matching pest,
            // which closes the string here too. Any other `\X` (including a
            // `\'` with a later quote — pest `escaped_quote`) escapes one byte.
            b'\\' if bytes.get(index + 1) == Some(&b'\'') && Some(index + 1) == last_quote => {
                return index + 1;
            }
            b'\\' => index += 2,
            b'\'' if next_is(bytes, index, b'\'') => index += 2,
            b'\'' => return index,
            _ => index += 1,
        }
    }
    bytes.len()
}

fn skip_double_quoted(bytes: &[u8], mut index: usize, last_quote: Option<usize>) -> usize {
    while index < bytes.len() {
        match bytes[index] {
            b'\\' if bytes.get(index + 1) == Some(&b'"') && Some(index + 1) == last_quote => {
                return index + 1;
            }
            b'\\' => index += 2,
            b'"' if next_is(bytes, index, b'"') => index += 2,
            b'"' => return index,
            _ => index += 1,
        }
    }
    bytes.len()
}

fn skip_no_escape_quoted(bytes: &[u8], mut index: usize, delimiter: u8) -> usize {
    while index < bytes.len() {
        if bytes[index] == delimiter {
            return index;
        }
        index += 1;
    }
    bytes.len()
}

fn skip_backtick_quoted(bytes: &[u8], mut index: usize, last_backtick: Option<usize>) -> usize {
    while index < bytes.len() {
        match bytes[index] {
            b'\\' if bytes.get(index + 1) == Some(&b'`') && Some(index + 1) == last_backtick => {
                return index + 1;
            }
            b'\\' => index += 2,
            b'`' if next_is(bytes, index, b'`') => index += 2,
            b'`' => return index,
            _ => index += 1,
        }
    }
    bytes.len()
}

fn skip_line_comment(bytes: &[u8], mut index: usize) -> usize {
    while index < bytes.len() {
        if bytes[index] == b'\n' {
            return index;
        }
        index += 1;
    }
    bytes.len()
}

fn skip_block_comment(bytes: &[u8], mut index: usize) -> usize {
    while index + 1 < bytes.len() {
        if bytes[index] == b'*' && bytes[index + 1] == b'/' {
            return index + 1;
        }
        index += 1;
    }
    bytes.len()
}

fn point_span(offset: usize) -> SourceSpan {
    SourceSpan::new(u32::try_from(offset).unwrap_or(u32::MAX), 1)
}