forbidden-strings 0.1.9

Out-of-band scanner for forbidden literal strings and regex patterns. Gitignore-aware, fast, dependency-light: built for CI deny-listing of leaked credentials and banned tokens.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
// What:     `use resharp::Regex;` imports the resharp regex type.
//           Used inside `load_ruleset` for the (smaller) regex bucket
//           on rules that use set-algebra; rules without set-algebra
//           go through the `regex` crate via `CompiledRegex::Plain`.
// Why:      Hybrid engine dispatch: this module owns the per-rule
//           routing decision via `requires_resharp`.
// TS map:   `import { Regex } from "resharp";`.
//
// In TS you'd write (pseudocode):
// ```ts
// import { Regex } from "resharp";
// ```
use resharp::Regex;

// What:     `use std::panic::{catch_unwind, AssertUnwindSafe};` brings
//           the panic-recovery primitives into scope for the
//           compile-time wrap on `Regex::new`. Full primer at the
//           same import in `src/rules/engine.rs`. Short version:
//           `catch_unwind(closure)` runs the closure with an unwind
//           barrier; an inner `panic!` becomes the outer `Err` arm
//           instead of propagating through the call stack.
//           `AssertUnwindSafe(...)` asserts to the compiler that
//           the captures are sound across the panic boundary --
//           `&str` already is `UnwindSafe`, but `catch_unwind` still
//           wants the wrapper at the closure boundary for the
//           future-`Send` requirement, so we keep the symmetric
//           shape with `engine.rs`.
// Why:      Resharp 0.5.x through 0.6.x `Regex::new` panics on some
//           rule shapes the fuzzer discovered (e.g. `(?:\w|$)(?:(?![1g]
//           \_X)& a)` triggers an arithmetic overflow inside
//           resharp-algebra's `attempt_rw_concat_2` at
//           `resharp-algebra/src/lib.rs:2470`; verified unchanged
//           between 0.5.3 and 0.6.0 by `/tmp/probe-resharp-06`).
//           Without `catch_unwind` the
//           panic aborts the scanner process during the parallel
//           regex-compile phase, taking every other in-flight
//           compile down with it. With `catch_unwind` the bad rule
//           returns a normal `Err(String)` that the loader bubbles
//           up to the user with the same `rule on line N (resharp): ...`
//           prefix as every other compile failure.
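// TS map:   Closest analogue is `try { ... } catch (e) { ... }`; TS has
//           no counterpart to the `AssertUnwindSafe` marker.
//
// In TS you'd write (pseudocode):
// ```ts
// // No direct equivalent: TS uses a plain try/catch, whereas Rust
// // requires catch_unwind + AssertUnwindSafe to intercept panics
// // across a closure boundary.
// ```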
use std::panic::{catch_unwind, AssertUnwindSafe};

// What:     `use super::{...};` imports helpers re-exported by the
//           parent `rules` module. `super` means "the module above
//           this one".
// Why:      The compile pipeline applies engine routing and every
//           structural pre-validator before constructing a regex, and
//           importing through `super` keeps the parent public surface
//           marked as used in normal builds.
// TS map:   `import { requiresResharp, stackedQuantifier } from "./rules";`.
//
// In TS you'd write (pseudocode):
// ```ts
// import { requiresResharp, stackedQuantifier } from "./rules";
// ```
use super::{
    complement_intersection_quantified_group,
    intersection_with_lookbehind,
    intersection_with_word_end_alternation,
    lookaround_in_alternation_with_sibling,
    lookaround_in_complement,
    nested_chain_in_lookaround_body,
    nested_complement,
    nested_grouped_quantifier,
    nested_lookahead_in_quantified_group,
    nested_quantifier_after_wildcard,
    nesting_depth,
    quantified_lookahead_with_sibling_content,
    requires_resharp,
    stacked_quantifier,
    CompiledRegex,
};

// What:     The byte alternation that matches the non-ASCII Unicode
//           whitespace code points as their UTF-8 byte sequences. Each
//           `\xHH` literal in the regex source compiles to one byte
//           under the regex crate's `unicode(false)` mode; the
//           alternation is then a cheap NFA branch (no per-codepoint
//           table). Coverage:
//           - U+0085 NEXT LINE (NEL)            `\xc2\x85`
//           - U+00A0 NBSP                       `\xc2\xa0`
//           - U+1680 OGHAM SPACE MARK           `\xe1\x9a\x80`
//           - U+180E MONGOLIAN VOWEL SEPARATOR  `\xe1\xa0\x8e`
//           - U+2000..U+200A (en quad..hair)    `\xe2\x80[\x80-\x8a]`
//           - U+2028 LINE SEPARATOR             `\xe2\x80\xa8`
//           - U+2029 PARAGRAPH SEPARATOR        `\xe2\x80\xa9`
//           - U+202F NARROW NO-BREAK SPACE      `\xe2\x80\xaf`
//           - U+205F MEDIUM MATH SPACE          `\xe2\x81\x9f`
//           - U+3000 IDEOGRAPHIC SPACE          `\xe3\x80\x80`
//           - U+FEFF ZERO-WIDTH NO-BREAK SPACE  `\xef\xbb\xbf`
//           The two `\xc2`-prefixed code points share one branch
//           (`\xc2[\x85\xa0]`), and the four `\xe2\x80`-prefixed
//           entries share another, so the alternation has seven
//           branches in total.
//           NOTE(review): U+180E and U+FEFF do not carry the
//           White_Space property in current Unicode, so `\s` under
//           `unicode(true)` would not match them. They are kept here
//           so existing rules keep matching what they matched before;
//           confirm that this wider-than-Unicode set is intended.
// Why:      Closes BUG 8 without forcing rules onto `unicode(true)`
//           compile. Pre-fix, `(?i)adafruit[\s]+=` against `adafruit\xc2\xa0=`
//           silently missed because `unicode(false)` treats `\s` as the
//           ASCII subset `[\t\n\v\f\r\x20]`. The previous fix forced
//           those rules to `unicode(true)` -- correct but ~90x more
//           expensive to compile (224 rules -> 478 ms phase 1, 64x
//           wall-time regression). Source-level expansion keeps every
//           rule on the `unicode(false)` fast path while widening the
//           class to cover the Unicode whitespace bytes. U+0085 is
//           included because it has the White_Space property, so a
//           `unicode(true)` `\s` matches it; omitting it would leave a
//           gap relative to the behaviour this rewrite replaces.
// TS map:   `const UNICODE_WS_ALT = "\\xc2[\\x85\\xa0]|...";`.
//
// In TS you'd write (pseudocode):
// ```ts
// const UNICODE_WS_ALT = String.raw`\xc2[\x85\xa0]|...`;
// ```
const UNICODE_WS_ALT: &str = r"\xc2[\x85\xa0]|\xe1\x9a\x80|\xe1\xa0\x8e|\xe2\x80[\x80-\x8a\xa8\xa9\xaf]|\xe2\x81\x9f|\xe3\x80\x80|\xef\xbb\xbf";

// What:     `fn scan_class(bytes, start) -> Option<(usize, bool)>`
//           walks a character class starting at `bytes[start] == b'['`
//           and returns `(close_offset, contains_s)` -- the index of
//           the `]` that closes THAT class AND whether the class
//           contains an unescaped `\s` shorthand (at any nesting
//           depth). Handles the corner cases:
//           - Leading `[^` (negation flag) does not start a body.
//           - A literal `]` at body-start position (`[]a-z]` or
//             `[^]a-z]`) is not the terminator.
//           - `\X` escapes consume two bytes (so `\]` and `\[` inside
//             the class neither terminate nor nest).
//           - An unescaped `[` inside the body opens a nested level
//             (POSIX bracket expression `[[:alpha:]]` or nested set
//             `[a-z&&[^aeiou]]`). Its own `]` closes only that level;
//             the same leading-`^` / leading-`]` rules apply to it.
// Why:      The source rewrite needs to know two things about each
//           class: where it ends (so the Unicode-WS alternation can be
//           spliced in around it) and whether it actually contains
//           `\s` (so only classes that need it are widened). Stopping
//           at the first `]` regardless of nesting reported
//           `[\s[:alpha:]]` as closing at the inner `]`, and the
//           caller then emitted `(?:[\s[:alpha:]|...)]`, an
//           unbalanced class. Tracking depth returns the true closing
//           bracket. Returns `None` for an unterminated class (at any
//           depth) -- the caller treats this as "do not rewrite; let
//           the regex compiler emit its own parse error."
// TS map:   `function scanClass(bytes: Uint8Array, start: number)
//                              : { close: number; containsS: boolean } | null`.
fn scan_class(bytes: &[u8], start: usize) -> Option<(usize, bool)> {
    // Step past the optional `^` and the optional body-leading literal
    // `]` that may directly follow an opening `[`. `pos` is the index
    // right after that `[`.
    fn skip_class_prefix(bytes: &[u8], mut pos: usize) -> usize {
        if bytes.get(pos) == Some(&b'^') {
            pos += 1;
        }
        if bytes.get(pos) == Some(&b']') {
            pos += 1;
        }
        pos
    }

    let mut j = skip_class_prefix(bytes, start + 1);
    // Number of `[` levels currently open; the class at `start` is 1.
    let mut depth: usize = 1;
    let mut contains_s = false;
    while let Some(&b) = bytes.get(j) {
        match b {
            // Escape pair: never a terminator or a nested opener. A
            // trailing lone backslash falls through to the default arm.
            b'\\' if j + 1 < bytes.len() => {
                if bytes[j + 1] == b's' {
                    contains_s = true;
                }
                j += 2;
            }
            b'[' => {
                depth += 1;
                j = skip_class_prefix(bytes, j + 1);
            }
            b']' => {
                depth -= 1;
                if depth == 0 {
                    return Some((j, contains_s));
                }
                j += 1;
            }
            _ => j += 1,
        }
    }
    None
}

// What:     `fn utf8_width(leading: u8) -> usize` maps the first byte
//           of a UTF-8 sequence to the number of bytes that sequence
//           occupies:
//           - 0x00..=0xbf -> 1 (ASCII, plus continuation bytes
//             0x80..=0xbf, which are not valid leading bytes but still
//             advance the cursor by one so the caller cannot stall on
//             malformed input),
//           - 0xc0..=0xdf -> 2,
//           - 0xe0..=0xef -> 3,
//           - everything from 0xf0 upward -> 4.
// Why:      The source rewrite has to copy multi-byte UTF-8 sequences
//           as whole units. Casting a single byte with `as char`
//           would mangle non-ASCII text, whereas slicing
//           `&src[i..i + width]` keeps the original encoding intact.
// TS map:   `function utf8Width(b: number): number`.
fn utf8_width(leading: u8) -> usize {
    match leading {
        0x00..=0xbf => 1,
        0xc0..=0xdf => 2,
        0xe0..=0xef => 3,
        _ => 4,
    }
}

// What:     `fn expand_unicode_whitespace(src) -> String` rewrites the
//           regex source so `\s` matches Unicode whitespace under
//           `unicode(false)` compile mode. Transformations:
//           - `\s` outside a character class -> `(?:\s|<UNICODE_WS_ALT>)`.
//             The `\s` inside the group still expands to ASCII WS
//             under `unicode(false)`; the alternation adds the
//             multi-byte UTF-8 sequences for the remaining whitespace
//             code points.
//           - `[...\s...]` (non-negated class containing unescaped
//             `\s`) -> `(?:[...\s...]|<UNICODE_WS_ALT>)`. The class
//             itself is preserved (matches its ASCII subset under
//             `unicode(false)`); the wrapping group adds the
//             multi-byte sequences. Semantic shift: under PCRE/Unicode
//             a class character takes one position, while the
//             expanded multi-byte UTF-8 here also occupies one
//             alternation slot. Quantifiers on the wrapped group
//             treat NBSP as a single match, which is closer to
//             author intent than the pre-fix "single byte" view.
//           - `[^...\s...]` (NEGATED class containing `\s`) is copied
//             verbatim. Such a class means "not whitespace"; wrapping
//             it with the alternation would let one class position
//             consume an entire Unicode whitespace sequence, which is
//             the opposite of what the class asks for. This is the
//             same reasoning that keeps `\S` unexpanded (see Why).
//           - Other escape sequences (`\X`, `\n`, `\xHH`) and literal
//             characters pass through verbatim. Multi-byte UTF-8
//             literals are preserved using `utf8_width`.
//           - An unterminated class (`scan_class` returns `None`) has
//             its `[` copied as-is and scanning resumes right after
//             it, so the regex compiler reports the parse error.
// Why:      Source-level expansion keeps every rule on the
//           `unicode(false)` fast path (~5ms phase 1) while making
//           `\s` honour the user's authoring intent that a rule like
//           `(?i)adafruit[\s]+=` matches `adafruit<NBSP>=`. The
//           previous BUG 8 fix forced these rules to `unicode(true)`,
//           costing ~478 ms phase 1 (95x regression). The rewrite
//           costs microseconds and lands the same correctness.
//           `\S` is intentionally NOT expanded: a sound "not Unicode
//           whitespace" would require subtracting multi-byte byte
//           sequences from a negated byte class, which has no clean
//           source representation. Rules using `\S` or `[^\s]` keep
//           ASCII-only semantics; document in PERF.md.
// TS map:   `function expandUnicodeWhitespace(src: string): string`.
//
// In TS you'd write (pseudocode):
// ```ts
// function expandUnicodeWhitespace(src: string): string {
//   let out = "";
//   let i = 0;
//   while (i < src.length) {
//     // ... handle \s, [...], escapes, multi-byte literals ...
//   }
//   return out;
// }
// ```
fn expand_unicode_whitespace(src: &str) -> String {
    // Appends `(?:<atom>|<UNICODE_WS_ALT>)` to `out`.
    fn push_widened(out: &mut String, atom: &str) {
        out.push_str("(?:");
        out.push_str(atom);
        out.push('|');
        out.push_str(UNICODE_WS_ALT);
        out.push(')');
    }

    let bytes = src.as_bytes();
    let mut out = String::with_capacity(src.len() + 64);
    // `i` only ever advances by whole characters (or past an ASCII
    // `]`), so it always sits on a UTF-8 character boundary and the
    // `&src[..]` slices below are valid.
    let mut i = 0;
    while i < bytes.len() {
        let b = bytes[i];
        // Escape sequence: copy verbatim, with one exception (`\s`).
        // A trailing lone backslash falls through to the literal copy.
        if b == b'\\' && i + 1 < bytes.len() {
            let next = bytes[i + 1];
            if next == b's' {
                push_widened(&mut out, "\\s");
                i += 2;
                continue;
            }
            // The escaped character may itself be multi-byte.
            let end = i + 1 + utf8_width(next);
            out.push_str(&src[i..end]);
            i = end;
            continue;
        }
        // Character class: widen only a non-negated class that holds `\s`.
        if b == b'[' {
            match scan_class(bytes, i) {
                Some((close_idx, contains_s)) => {
                    let class_slice = &src[i..=close_idx];
                    let negated = bytes.get(i + 1) == Some(&b'^');
                    if contains_s && !negated {
                        push_widened(&mut out, class_slice);
                    } else {
                        out.push_str(class_slice);
                    }
                    i = close_idx + 1;
                }
                None => {
                    // Unterminated class -- pass through and let the
                    // regex compiler report the parse error.
                    out.push('[');
                    i += 1;
                }
            }
            continue;
        }
        // Literal character (ASCII or multi-byte UTF-8): copy verbatim.
        let width = utf8_width(b);
        out.push_str(&src[i..i + width]);
        i += width;
    }
    out
}

// What:     `pub fn compile_rule_src(src: &str) -> Result<CompiledRegex, String>`
//           is the single source of truth for the regex compile
//           decision. It walks the routing classifier
//           (`requires_resharp`), runs the lookaround-in-complement
//           pre-flight guard when routing to resharp, and dispatches
//           to the resharp `Regex::new` or the unicode-fallback
//           `regex` builder. Returns `CompiledRegex` directly --
//           callers that need a line-indexed `RegexRule` (the
//           production loader) wrap it with the `idx` themselves.
// Why:      The plan requires fuzz_api and production to share the
//           same compile path so the AC-gate soundness fuzzer
//           exercises identical behaviour. Splitting into a thin
//           "wrap with idx" outer layer + a `compile_rule_src`
//           core gives both call sites that property.
// TS map:   `function compileRuleSrc(src: string): CompiledRegex`.
//
// In TS you'd write (pseudocode):
// ```ts
// function compileRuleSrc(src: string): CompiledRegex {
//   if (requiresResharp(src)) {
//     const reason = lookaroundInComplement(src);
//     if (reason) throw new Error(`(resharp): ${reason}`);
//     try { return { kind: "resharp", re: new Regex(src) }; }
//     catch (e) { throw new Error(`(resharp): ${e}`); }
//   }
//   return compilePlainToCompiled(src);
// }
// ```
pub fn compile_rule_src(src: &str) -> Result<CompiledRegex, String> {
    // What:     `if let Some(reason) = stacked_quantifier(src)` runs
    //           the structural pre-validator first. The detector flags
    //           two regex quantifier suffixes appearing back-to-back
    //           without an atom between them (`a**`,
    //           `\D{5,11}{5,11}`, `(?:a){2}{3}`). Both engines reject
    //           or wall-clock on the shape: the `regex` crate's
    //           NFA-construction reaches the 256 MB DFA size limit and
    //           takes ~1.4-1.5 seconds to error on the first attempt
    //           and the same again on the unicode(true) retry --
    //           ~2.9 seconds total per `compile_rule_src` call, which
    //           libFuzzer's `report_slow_units` flags after ASAN
    //           overhead pushes a single fuzz iteration past 10s.
    //           Resharp's parser rejects the same shape in
    //           microseconds with `UnsupportedResharpRegex`, but the
    //           shape lacks any `requires_resharp` trigger and never
    //           reaches that engine in production. The pre-validator
    //           closes the gap.
    // Why:      Stacked quantifiers are virtually never authored
    //           intentionally; rejecting them at the source-level
    //           pre-validator surfaces a clear error in microseconds
    //           instead of burning the libFuzzer slow-unit budget on
    //           one input. Placed BEFORE `requires_resharp` so the
    //           error namespace reads as "the source shape is
    //           structurally bad", not "the plain path specifically
    //           dislikes it".
    // TS map:   `const reason = stackedQuantifier(src); if (reason) throw new Error(`(regex): ${reason}`);`.
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // const reason = stackedQuantifier(src);
    // if (reason) throw new Error(`(regex): ${reason}`);
    // ```
    if let Some(reason) = stacked_quantifier(src) {
        return Err(format!("(regex): {}", reason));
    }
    // What:     `if let Some(reason) = nested_grouped_quantifier(src)`
    //           catches the GROUPED form of multiplicative quantifier
    //           blowup: chains of `){quant})` adjacencies four or more
    //           deep, the shape the fuzz target's `Node::Quant`
    //           renderer actually emits (always wraps in `(?:...)`).
    //           Without this guard, the slow-unit shape
    //           `(?iu)(?:(?:(?:(?:(?:\d){5,11}){5,11}){5,11}){5,11}){5,11}(?:(?:(?:(?:(?:\d)*)*)*)*)*aa`
    //           takes ~3 seconds to error with `CompiledTooBig` -- well
    //           past the libFuzzer slow-unit threshold under ASAN.
    //           Placed alongside `stacked_quantifier` (both are
    //           structural shape pre-validators that apply regardless
    //           of engine routing).
    // Why:      `stacked_quantifier` catches `\D*****` and
    //           `a{5,11}{5,11}` (bare back-to-back quantifier
    //           suffixes); `nested_grouped_quantifier` catches the
    //           wrapped form `(?:(?:a){5,11}){5,11}`-deep that the
    //           generator actually produces. Both are needed because
    //           the regex-source-shape space is wider than either
    //           detector alone covers.
    // TS map:   `const reason = nestedGroupedQuantifier(src); if (reason) throw new Error(`(regex): ${reason}`);`.
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // const reason = nestedGroupedQuantifier(src);
    // if (reason) throw new Error(`(regex): ${reason}`);
    // ```
    if let Some(reason) = nested_grouped_quantifier(src) {
        eprintln!(
            "forbidden-strings: pre-validator nested_grouped_quantifier rejected rule {:?}",
            src
        );
        return Err(format!("(regex): {}", reason));
    }
    // What:     `if requires_resharp(src) { ... } else { ... }` runs
    //           the cheap routing classifier first. Resharp-only
    //           constructs (set algebra `A&B`, complement `~(A)`,
    //           lookarounds `(?=`/`(?!`/`(?<=`/`(?<!`, bare `_`
    //           wildcard outside a class) route to resharp; every
    //           other rule rides the faster `regex` crate.
    // Why:      Match the production dispatch decision exactly --
    //           fuzz targets that compile a generated source must
    //           hit the same branch the user would.
    // TS map:   `if (requiresResharp(src)) ... else ...`.
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // if (requiresResharp(src)) {
    //   // resharp path
    // } else {
    //   // regex-crate path
    // }
    // ```
    if requires_resharp(src) {
        // What:     `if let Some(reason) = nesting_depth(src)` runs the
        //           nesting-depth guard FIRST among the resharp checks.
        //           `nesting_depth` returns `Some(reason)` when the rule
        //           nests groups past a safe cap. `if let Some(reason) =
        //           ...` is Rust's one-arm pattern match that binds
        //           `reason` only in the present (`Some`) case.
        // Why:      Deeply nested complement (`~(...)`) or lookaround
        //           (`(?=...)`) groups aborted the scanner with an
        //           uncatchable stack overflow inside resharp's
        //           `Regex::new` through 0.6.8 (Bug G). resharp 0.6.9 caps
        //           parser recursion at the same depth upstream, but this
        //           pre-validator still rejects on the source shape before
        //           any other check or `Regex::new` as belt-and-suspenders:
        //           catch_unwind cannot intercept a stack-overflow SIGABRT.
        // TS map:   `const reason = nestingDepth(src); if (reason) throw new Error(`(resharp): ${reason}`);`.
        //
        // In TS you'd write (pseudocode):
        // ```ts
        // const reason = nestingDepth(src);
        // if (reason) throw new Error(`(resharp): ${reason}`);
        // ```
        if let Some(reason) = nesting_depth(src) {
            eprintln!(
                "forbidden-strings: pre-validator nesting_depth rejected rule {:?}",
                src
            );
            return Err(format!("(resharp): {}", reason));
        }
        // What:     `if let Some(reason) = lookaround_in_complement(src)`
        //           runs the resharp pre-flight guard. The function
        //           returns `Some(reason_string)` when the source
        //           contains a `~(...)` complement whose body holds
        //           a `\b`/`\B`/`^`/`$` or user-explicit lookaround
        //           (resharp 0.5.x through 0.6.x rejects those shapes
        //           with opaque errors). Returning early here surfaces an
        //           actionable message instead of resharp's
        //           internal error.
        // Why:      Identical pre-flight to production. The fuzzer
        //           must trip exactly the same guard the user would
        //           when authoring a complement-body lookaround.
        // TS map:   `const reason = lookaroundInComplement(src); if (reason) throw new Error(...);`.
        //
        // In TS you'd write (pseudocode):
        // ```ts
        // const reason = lookaroundInComplement(src);
        // if (reason) throw new Error(`(resharp): ${reason}`);
        // ```
        if let Some(reason) = lookaround_in_complement(src) {
            eprintln!(
                "forbidden-strings: pre-validator lookaround_in_complement rejected rule {:?}",
                src
            );
            return Err(format!("(resharp): {}", reason));
        }
        // What:     Two additional pre-validators for resharp panic /
        //           silent-corruption shapes the fuzzer discovered in
        //           0.5.x and re-verified against 0.6.0. Both are
        //           defined alongside `lookaround_in_complement` in
        //           `engine.rs`; each returns `Some(reason)` when its
        //           known-bad shape is detected and `None` otherwise.
        //           Returning early surfaces an actionable message
        //           before resharp's `Regex::new` reaches the
        //           panicking / corrupting code path. Note: one of the
        //           two shapes (`intersection_with_lookbehind`) panics
        //           in `engine.rs:1020` behind a `debug_assert!`; in
        //           release that path returns wrong matches instead of
        //           panicking, so the pre-validator is the only defense
        //           (catch_unwind cannot catch what does not panic).
        // Why:      `catch_unwind` below is the load-bearing safety
        //           net for arbitrary upstream panics, but the panic
        //           messages it surfaces are generic ("panic during
        //           compile") and tell the rule author nothing about
        //           why the rule is bad. These pre-validators name
        //           the structural trigger for the two shapes we
        //           have bisected and let the author rewrite the
        //           rule into a supported form. See
        //           docs/troubleshooting/resharp.md for the bisection
        //           record and rewrite recipes.
        // TS map:   `for (const check of [intersectionWithLookbehind, intersectionWithWordEndAlternation]) { const r = check(src); if (r) throw new Error(`(resharp): ${r}`); }`.
        //
        // In TS you'd write (pseudocode):
        // ```ts
        // for (const check of [intersectionWithLookbehind, intersectionWithWordEndAlternation]) {
        //   const r = check(src);
        //   if (r) throw new Error(`(resharp): ${r}`);
        // }
        // ```
        if let Some(reason) = intersection_with_lookbehind(src) {
            eprintln!(
                "forbidden-strings: pre-validator intersection_with_lookbehind rejected rule {:?}",
                src
            );
            return Err(format!("(resharp): {}", reason));
        }
        if let Some(reason) = intersection_with_word_end_alternation(src) {
            eprintln!(
                "forbidden-strings: pre-validator intersection_with_word_end_alternation rejected rule {:?}",
                src
            );
            return Err(format!("(resharp): {}", reason));
        }
        // What:     `lookaround_in_alternation_with_sibling` catches
        //           the shape `(a|(?![X]))(?!Y)` and variants -- an
        //           alternation containing a lookaround followed by
        //           another lookaround. Bisected from
        //           `crash-8cba104f0805ccb567513aff895398a4f652200c`.
        //           Compiles through resharp's parser but trips the
        //           `engine.rs:1020` debug_assert on the forward DFA
        //           scan; the panic aborts the fuzz process before
        //           `catch_unwind` in `CompiledRegex::find_all` can
        //           intercept (libFuzzer-sys's panic hook calls abort
        //           first).
        // Why:      The original HANDOVER assumed the panic shape was
        //           `&` + lookahead; bisection of the actual crash
        //           artifact revealed the shape is alternation-with-
        //           lookaround + sibling-lookaround instead. The
        //           generalised `intersection_with_lookbehind`
        //           (renamed conceptually to "with any lookaround")
        //           handles the original `&`+lookahead case
        //           defensively; this new pre-validator handles the
        //           shape actually appearing in the fuzz corpus.
        // TS map:   `const reason = lookaroundInAlternationWithSibling(src); if (reason) throw new Error(`(resharp): ${reason}`);`.
        //
        // In TS you'd write (pseudocode):
        // ```ts
        // const reason = lookaroundInAlternationWithSibling(src);
        // if (reason) throw new Error(`(resharp): ${reason}`);
        // ```
        if let Some(reason) = lookaround_in_alternation_with_sibling(src) {
            eprintln!(
                "forbidden-strings: pre-validator lookaround_in_alternation_with_sibling rejected rule {:?}",
                src
            );
            return Err(format!("(resharp): {}", reason));
        }
        // What:     `complement_intersection_quantified_group` catches
        //           the shape `<prefix>~(\w)&(?:...)*` that causes
        //           resharp's algebra simplifier to hang for tens of
        //           seconds or indefinitely during `Regex::new`.
        //           Bisected from
        //           `timeout-00179d433e26fbcc3bedf2b7b38b6ce1ff9e6438`.
        //           catch_unwind below cannot catch non-termination,
        //           and resharp does not expose a compile timeout,
        //           so structural rejection is the only safe option.
        // Why:      The compile hangs past libFuzzer's per-input
        //           timeout (default 1200s, our fuzz run uses 10s
        //           per input), halting the run entirely. The shape
        //           is virtually never authored by humans (no rule
        //           in the production corpus combines `&` and `~(`),
        //           so the false-positive risk is theoretical only.
        // TS map:   `const reason = complementIntersectionQuantifiedGroup(src); if (reason) throw new Error(`(resharp): ${reason}`);`.
        //
        // In TS you'd write (pseudocode):
        // ```ts
        // const reason = complementIntersectionQuantifiedGroup(src);
        // if (reason) throw new Error(`(resharp): ${reason}`);
        // ```
        if let Some(reason) = complement_intersection_quantified_group(src) {
            eprintln!(
                "forbidden-strings: pre-validator complement_intersection_quantified_group rejected rule {:?}",
                src
            );
            return Err(format!("(resharp): {}", reason));
        }
        // What:     `nested_lookahead_in_quantified_group` catches the
        //           shape `(?:(?:(?!X)){m,n}){p,q}` (and `(?:(?!X){m,n}){p,q}`)
        //           where the outer quantifier has min >= 2. Bisected from
        //           `crash-06d9dd9fa1abfeec72a8154c09434b237dfc7f38` and
        //           `crash-df95fcd52de76d952ee3db291f59434ece2c0b81`. Both
        //           reproduce a u32 addition overflow at
        //           `resharp-algebra/src/lib.rs:2470` during `Regex::new`.
        //           libfuzzer-sys's panic hook calls abort before
        //           `catch_unwind` can intercept, so the structural
        //           pre-validator is the only way to keep the fuzz target
        //           moving past these shapes.
        // Why:      Without this guard the fuzz target halted with a
        //           crash artifact instead of continuing to the
        //           soundness-by-revert verification. In production
        //           (debug-assertions OFF) the same shape silently wraps
        //           to 0 and likely produces wrong matches -- another
        //           reason to reject at the boundary.
        // TS map:   `const reason = nestedLookaheadInQuantifiedGroup(src); if (reason) throw new Error(`(resharp): ${reason}`);`.
        //
        // In TS you'd write (pseudocode):
        // ```ts
        // const reason = nestedLookaheadInQuantifiedGroup(src);
        // if (reason) throw new Error(`(resharp): ${reason}`);
        // ```
        if let Some(reason) = nested_lookahead_in_quantified_group(src) {
            eprintln!(
                "forbidden-strings: pre-validator nested_lookahead_in_quantified_group rejected rule {:?}",
                src
            );
            return Err(format!("(resharp): {}", reason));
        }
        // What:     `quantified_lookahead_with_sibling_content` catches a
        //           second Bug F shape: `(?:(?!X)){m,n}<atom>` (a
        //           single variable-quantified lookahead-bearing
        //           group followed by any content at parent depth).
        //           Bisected from `crash-a219859099426658d70e90bc97f560b85f2cf256`
        //           which minimised to `(?:(?!abc)){4,12}a`. Same
        //           overflow path at `resharp-algebra/src/lib.rs:2470`
        //           as the nested-quant shape but a different upstream
        //           trigger (the trailing content feeds into the
        //           lookahead-chain derivative without an intermediate
        //           `Quant` wrap). The validator is intentionally
        //           broad: it false-positives on the safe "exact-quant"
        //           shape `(?:(?!X)){n}<atom>` and the
        //           "long-uniform-trail" shape `(?:(?!X)){m,n}aaa`,
        //           but full coverage is required to keep the fuzz
        //           target moving past Bug F. See
        //           docs/handover/forbidden-strings-fuzzing.md for the
        //           trade-off discussion.
        // Why:      Without this guard the soundness-by-revert phase
        //           11 fuzz run halts on the trailing-content Bug F
        //           shape before reaching the (?u)-Unicode case-fold
        //           soundness panic the target was built to catch.
        // TS map:   `const reason = quantifiedLookaheadWithSiblingContent(src); if (reason) throw new Error(`(resharp): ${reason}`);`.
        //
        // In TS you'd write (pseudocode):
        // ```ts
        // const reason = quantifiedLookaheadWithSiblingContent(src);
        // if (reason) throw new Error(`(resharp): ${reason}`);
        // ```
        if let Some(reason) = quantified_lookahead_with_sibling_content(src) {
            eprintln!(
                "forbidden-strings: pre-validator quantified_lookahead_with_sibling_content rejected rule {:?}",
                src
            );
            return Err(format!("(resharp): {}", reason));
        }
        // What:     `nested_quantifier_after_wildcard` catches the
        //           depth-3 nested-quantifier-on-`_`-wildcard shape
        //           decoded from slow-unit artifacts
        //           `slow-unit-8c4172d7d381b5c64c5aba568217c38c5ce94945`
        //           (compile 409ms + scan 1.16s) and
        //           `slow-unit-709cb39b5255ddf0721c435159191d03aa0498ea`
        //           (compile 4.33s). Catches at chain >= 3 immediately
        //           after a bare `_` outside a class.
        // Why:      The `_` triad expands to wildcard; nesting depth
        //           3+ quantifiers on it explodes resharp's NFA
        //           construction. libFuzzer keeps these slow units in
        //           the corpus and replays them, halving exec/s
        //           throughput. Catching at the source-shape level
        //           rejects the rule in microseconds.
        // TS map:   `const reason = nestedQuantifierAfterWildcard(src); if (reason) throw new Error(`(resharp): ${reason}`);`.
        //
        // In TS you'd write (pseudocode):
        // ```ts
        // const reason = nestedQuantifierAfterWildcard(src);
        // if (reason) throw new Error(`(resharp): ${reason}`);
        // ```
        if let Some(reason) = nested_quantifier_after_wildcard(src) {
            eprintln!(
                "forbidden-strings: pre-validator nested_quantifier_after_wildcard rejected rule {:?}",
                src
            );
            return Err(format!("(resharp): {}", reason));
        }
        // What:     `nested_chain_in_lookaround_body` catches the
        //           depth-3 nested-quantifier shape sitting inside a
        //           lookaround body, decoded from
        //           `slow-unit-4eabfd5c52969dcc20c2170cd30947eccf8ae62f`
        //           (compile 1.9s before resharp errors with
        //           `Algebra(UnsupportedPattern)`).
        // Why:      Even with a literal innermost atom, resharp's
        //           algebra simplifier walks derivative shapes per-
        //           prefix per-suffix inside lookarounds, multiplying
        //           the chain's NFA cost by the lookaround context
        //           size. The compile wall-clocks past libFuzzer's
        //           slow-unit threshold even though the eventual
        //           outcome is `Err`. Source-shape rejection avoids
        //           the wall-clock burn.
        // TS map:   `const reason = nestedChainInLookaroundBody(src); if (reason) throw new Error(`(resharp): ${reason}`);`.
        //
        // In TS you'd write (pseudocode):
        // ```ts
        // const reason = nestedChainInLookaroundBody(src);
        // if (reason) throw new Error(`(resharp): ${reason}`);
        // ```
        if let Some(reason) = nested_chain_in_lookaround_body(src) {
            eprintln!(
                "forbidden-strings: pre-validator nested_chain_in_lookaround_body rejected rule {:?}",
                src
            );
            return Err(format!("(resharp): {}", reason));
        }
        // What:     `nested_complement` catches rule shapes containing
        //           one complement `~(...)` whose body contains another
        //           complement. Decoded from
        //           `timeout-95f5e661c596e4b5a12e9841cda2e3ba242ecf7a`
        //           (the new-generator counterpart to slow-unit-4eab).
        //           Probed compile times: 916ms for `~(~(X))` and
        //           913ms for `~((?:~(X)))` (transparent-group form);
        //           1.84ms for single `~(X)`.
        // Why:      Resharp's algebra simplifier walks both derivative
        //           chains in complement-of-complement. Under ASAN
        //           the 900ms cost amplifies past libFuzzer's 10s
        //           timeout (the timeout artifact reproduced in 31s
        //           through the fuzz binary). Source-shape rejection
        //           is the only way to keep the fuzz target moving.
        //           Sibling complements `~(...)&~(...)` (production
        //           shape) are NOT caught -- the inner complement is
        //           detected only when an outer one is open.
        // TS map:   `const reason = nestedComplement(src); if (reason) throw new Error(`(resharp): ${reason}`);`.
        //
        // In TS you'd write (pseudocode):
        // ```ts
        // const reason = nestedComplement(src);
        // if (reason) throw new Error(`(resharp): ${reason}`);
        // ```
        if let Some(reason) = nested_complement(src) {
            eprintln!(
                "forbidden-strings: pre-validator nested_complement rejected rule {:?}",
                src
            );
            return Err(format!("(resharp): {}", reason));
        }
        // What:     `Regex::new(src).map(CompiledRegex::Resharp).map_err(...)`.
        //           `Regex::new` is resharp's compile constructor;
        //           `.map(CompiledRegex::Resharp)` wraps the
        //           successful `Regex` into the `Resharp` variant
        //           (the function reference is used in place of an
        //           explicit closure). `.map_err(...)` turns
        //           resharp's `Error` into our `String` error
        //           channel, prefixed with `(resharp):` so the
        //           outer caller can prepend `rule on line N`.
        // Why:      Produce a `CompiledRegex` ready to consume.
        // TS map:   `try { return { kind: "resharp", re: new Regex(src) }; } catch (e) { throw new Error(`(resharp): ${e}`); }`.
        //
        // In TS you'd write (pseudocode):
        // ```ts
        // try { return { kind: "resharp", re: new Regex(src) }; }
        // catch (e) { throw new Error(`(resharp): ${e}`); }
        // ```
        // What:     `catch_unwind(AssertUnwindSafe(|| Regex::new(src)))`.
        //           - The outer `catch_unwind` runs the inner closure
        //             under an unwind barrier. If `Regex::new` panics
        //             during DFA construction (resharp-algebra 0.5.x
        //             through 0.6.x has an `attempt to add with overflow`
        //             panic at `lib.rs:2470` reachable from some
        //             fuzzer-discovered rule shapes; the
        //             `overflow-checks = true` profile setting in
        //             Cargo.toml is load-bearing for the panic to fire
        //             in release), the panic is caught and we surface
        //             a normal error string instead of
        //             aborting the scanner.
        //           - `AssertUnwindSafe(...)` wraps the closure so the
        //             type-checker accepts the closure's captures
        //             across the panic boundary. `&str` (the `src`
        //             capture) IS `UnwindSafe`, but the wrapper is
        //             still required because we capture by reference
        //             and `catch_unwind`'s closure bound is `FnOnce()
        //             + UnwindSafe`.
        //           - The nested `match caught` flattens the two-level
        //             `Result<Result<Regex, resharp::Error>, Box<dyn Any + Send>>`
        //             into a single `Result<CompiledRegex, String>`:
        //             outer `Err` (panic) becomes `(resharp): panic
        //             during compile`, inner `Err` becomes the
        //             standard `(resharp): <error>` shape, inner `Ok`
        //             wraps into `CompiledRegex::Resharp(...)`. The
        //             actionable detail (which rule shape) lives in
        //             `src`, which the outer loader already prepends
        //             via the `rule on line N` prefix.
        // Why:      Defense in depth. The pre-validator checks above catch
        //           every known panicking shape and surface a
        //           specific message; this wrapper is the fallback
        //           for shapes the pre-validator does not yet know
        //           about. Without the wrapper a single bad rule
        //           crashes the whole scanner; with it, the rule's
        //           line is named in the error and every other rule
        //           continues to compile.
        // TS map:   `try { return new Regex(src); } catch (e) { throw new Error(`(resharp): ${e}`); }`.
        //
        // In TS you'd write (pseudocode):
        // ```ts
        // try { return { kind: "resharp", re: new Regex(src) }; }
        // catch (e) { throw new Error(`(resharp): ${e}`); }
        // ```
        let caught = catch_unwind(AssertUnwindSafe(|| Regex::new(src)));
        return match caught {
            Ok(Ok(re)) => Ok(CompiledRegex::Resharp(re)),
            Ok(Err(e)) => Err(format!("(resharp): {:?}", e)),
            Err(_) => Err(
                "(resharp): panic during compile (upstream resharp 0.5.x through 0.6.x bug). See docs/troubleshooting/resharp.md."
                    .to_string()
            ),
        };
    }
    compile_plain_rule_to_compiled(src)
}

// What:     `fn compile_plain_rule_to_compiled(src: &str) -> Result<CompiledRegex, String>`
//           is the unicode-off / unicode-on fallback compile path
//           for rules that did NOT route to resharp. Identical to
//           the previous `compile_plain_rule` body, but returns a
//           `CompiledRegex` without the rule index so it composes
//           into `compile_rule_src`.
// Why:      Keep the "fast path -> retry with unicode" mechanic
//           in one place. `compile_plain_rule` is now a thin
//           wrapper that calls this and decorates the error
//           with `rule on line N` for diagnostics.
// TS map:   `function compilePlainToCompiled(src: string): CompiledRegex`.
//
// In TS you'd write (pseudocode):
// ```ts
// function compilePlainToCompiled(src: string): CompiledRegex { ... }
// ```
fn compile_plain_rule_to_compiled(src: &str) -> Result<CompiledRegex, String> {
    // What:     Byte cap (256 MiB) passed to both `size_limit` and
    //           `dfa_size_limit` on every build attempt below.
    // Why:      Rules with large bounded repetitions (e.g.
    //           `[\w-]{138,300}`) need more headroom than the regex
    //           crate's default caps. Naming the value once keeps the
    //           two attempts from drifting apart.
    // TS map:   `const COMPILE_LIMIT_BYTES = 256 * 1024 * 1024;`.
    const COMPILE_LIMIT_BYTES: usize = 256 * 1024 * 1024;

    // What:     Run the rule source through `expand_unicode_whitespace`
    //           and compile the rewritten text instead of the raw
    //           input. `rewritten` owns the result; `pattern` borrows it.
    // Why:      Lets `\s` (free or inside a class) match Unicode
    //           whitespace byte sequences while staying on the
    //           `unicode(false)` fast path (BUG 8). See the helper's
    //           docstring for the transformation rules.
    // TS map:   `const pattern = expandUnicodeWhitespace(src);`.
    let rewritten = expand_unicode_whitespace(src);
    let pattern = rewritten.as_str();

    // What:     `attempt(unicode_on)` builds a `regex::bytes::Regex`
    //           from `pattern` with the shared size caps; the only
    //           knob that varies between calls is the unicode flag.
    // Why:      Both compile attempts must be configured identically
    //           apart from unicode mode, so the builder chain lives in
    //           one closure rather than being written out twice.
    // TS map:   `const attempt = (unicode: boolean) => buildRegex(pattern, { unicode, sizeLimit, dfaSizeLimit });`.
    let attempt = |unicode_on: bool| {
        regex::bytes::RegexBuilder::new(pattern)
            .unicode(unicode_on)
            .size_limit(COMPILE_LIMIT_BYTES)
            .dfa_size_limit(COMPILE_LIMIT_BYTES)
            .build()
    };

    // What:     Try the unicode-off build first. If it returns `Err`
    //           for any reason, that error is discarded and the same
    //           pattern is rebuilt with unicode on. Only the error of
    //           the second attempt is reported, formatted with `{:?}`
    //           behind a `(regex):` prefix.
    // Why:      Unicode-off is the fast path; rules that need
    //           unicode-aware semantics fail it and are picked up by
    //           the retry. The `(regex):` prefix lets the caller
    //           prepend its own `rule on line N` context.
    // TS map:   `try { return plain(attempt(false)); } catch { try { return plain(attempt(true)); } catch (e) { throw new Error(`(regex): ${e}`); } }`.
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // let re;
    // try { re = attempt(false); }
    // catch {
    //   try { re = attempt(true); }
    //   catch (e) { throw new Error(`(regex): ${e}`); }
    // }
    // return { kind: "plain", re };
    // ```
    let compiled = match attempt(false) {
        Ok(fast) => fast,
        Err(_) => match attempt(true) {
            Ok(unicode_aware) => unicode_aware,
            Err(e) => return Err(format!("(regex): {:?}", e)),
        },
    };
    Ok(CompiledRegex::Plain(compiled))
}