forbidden-strings 0.1.9

Out-of-band scanner for forbidden literal strings and regex patterns. Gitignore-aware, fast, dependency-light: built for CI deny-listing of leaked credentials and banned tokens.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
// What:     Integration tests for the extract -> AC -> match
//           pipeline. Exercises the actual soundness invariant the
//           UTF-8 walker bug broke: a regex rule's leading literal,
//           after extraction, must round-trip through Aho-Corasick
//           byte-matching against file content containing the same
//           bytes.
// Why:      A unit test on `walk_literal_bytes` alone can pass while
//           the end-to-end pipeline still has a different soundness
//           gap. This file plugs that hole.
// TS map:   `import { extractGatingSubstrings } from "./extract";
//           import AhoCorasick from "ahocorasick"; describe(...)`.
//
// In TS you'd write (pseudocode):
// ```ts
// import { extractGatingSubstrings } from "./extract";
// import AhoCorasick from "ahocorasick";
// describe("extract -> AC", () => { ... });
// ```

// What:     `use super::extract::extract_gating_substrings;` -- the
//           function under test, exposed `pub` from `extract.rs`.
// Why:      Avoid full-path noise.
// TS map:   `import { extractGatingSubstrings } from "./extract";`.
//
// In TS you'd write (pseudocode):
// ```ts
// import { extractGatingSubstrings } from "./extract";
// ```
use super::extract::extract_gating_substrings;
// What:     `use aho_corasick::AhoCorasick;` -- the multi-pattern
//           literal-matcher type from the `aho-corasick` crate
//           (already a project dependency).
// Why:      Build an AC from the extracted substrings and search
//           content; this is exactly what `rules.rs` does in the
//           real loader.
// TS map:   `import AhoCorasick from "ahocorasick";`.
//
// In TS you'd write (pseudocode):
// ```ts
// import AhoCorasick from "ahocorasick";
// ```
use aho_corasick::AhoCorasick;

// What:     `#[test] fn em_dash_prefix_extracts_correctly()`. Marks
//           a unit test discoverable by `cargo test`.
// Why:      Headline regression check: pre-fix, `extract_gating_substrings("—password")`
//           returned `Some(vec)` containing a 6-byte mojibake
//           string. Post-fix it must contain the original 3 bytes
//           of the em-dash followed by `password`.
// TS map:   `test("em-dash prefix extracts correctly", () => { ... });`.
//
// In TS you'd write (pseudocode):
// ```ts
// test("em-dash prefix extracts correctly", () => { ... });
// ```
#[test]
fn em_dash_prefix_extracts_correctly() {
    // What:     Run the extractor on a plain literal that starts with an
    //           em-dash and unwrap the returned `Option` in one step.
    //           `.expect()` panics with the given message on `None`.
    // Why:      The pattern has no alternation and no metacharacters,
    //           so extraction is expected to succeed.
    // TS map:   `const extracted = extractGatingSubstrings("—password")!;`.
    let extracted = extract_gating_substrings("—password")
        .expect("expected Some for plain literal");

    // What:     Pin the shape of the result before indexing into it.
    // Why:      A single literal with no top-level `|` should yield
    //           exactly one (substring, ci) entry.
    // TS map:   `expect(extracted.length).toBe(1);`.
    assert_eq!(extracted.len(), 1, "expected exactly one substring");

    // What:     Read both tuple fields by position: `.0` is the
    //           substring, `.1` is the case-insensitivity flag.
    // Why:      Named locals keep the two assertions below readable.
    // TS map:   `const [literal, caseInsensitive] = extracted[0];`.
    let literal = &extracted[0].0;
    let case_insensitive = extracted[0].1;

    // What:     Compare raw bytes against a byte-string literal: the
    //           em-dash is `\xe2\x80\x94`, followed by ASCII `password`.
    // Why:      A byte-level comparison is what exposes a walker that
    //           re-encodes the em-dash into different bytes; a
    //           char-level comparison could hide that.
    // TS map:   `expect([...new TextEncoder().encode(literal)]).toEqual([...]);`.
    assert_eq!(
        literal.as_bytes(),
        b"\xe2\x80\x94password",
        "extracted substring should be the original UTF-8 bytes"
    );

    // What:     The flag must be `false`.
    // Why:      The pattern carried no `(?i)` prefix.
    // TS map:   `expect(caseInsensitive).toBe(false);`.
    assert!(!case_insensitive, "ci flag should be false (no (?i) prefix)");
}

#[test]
fn em_dash_prefix_round_trips_through_aho_corasick() {
    // What:     The simulated file content. The em-dash sits right
    //           after `"prefix "`, which is 7 ASCII bytes.
    // Why:      Gives the matcher a haystack that contains the
    //           forbidden phrase at a known byte offset.
    // TS map:   `const content = "prefix —password suffix";`.
    let content = "prefix —password suffix";

    // What:     Extract the gating substrings for the rule and unwrap
    //           the `Option`; `.expect()` panics on `None`.
    // Why:      Same fixture as the extraction-only em-dash test.
    // TS map:   `const extracted = extractGatingSubstrings("—password")!;`.
    let extracted = extract_gating_substrings("—password")
        .expect("expected Some for plain literal");

    // What:     Project each (substring, ci) tuple down to a `&str`
    //           and collect into a `Vec<&str>`; the flag is ignored.
    // Why:      `AhoCorasick::new` takes an iterable of string-like
    //           patterns.
    // TS map:   `const needles = extracted.map(([s]) => s);`.
    let needles: Vec<&str> = extracted.iter().map(|pair| pair.0.as_str()).collect();

    // What:     Build the automaton; `.expect()` panics if the build
    //           returns an error.
    // Why:      The test must drive a real Aho-Corasick matcher so it
    //           covers byte-level matching, not just extraction.
    // TS map:   `const automaton = new AhoCorasick(needles);`.
    let automaton = AhoCorasick::new(&needles).expect("AC build should succeed");

    // What:     Collect every hit reported by `find_iter`.
    // Why:      We need to know that at least one hit exists and
    //           where the first one starts.
    // TS map:   `const hits = [...automaton.search(content)];`.
    let hits: Vec<_> = automaton.find_iter(content).collect();

    // What:     Require at least one hit.
    // Why:      This is the core soundness invariant: the extracted
    //           bytes must be findable in content holding the same
    //           bytes, otherwise the rule is silently disabled.
    // TS map:   `expect(hits.length).toBeGreaterThan(0);`.
    assert!(
        !hits.is_empty(),
        "AC should find at least one match -- this is the soundness invariant the UTF-8 bug broke"
    );

    // What:     The first hit must begin at byte offset 7.
    // Why:      Confirms the match landed on the em-dash itself, not
    //           somewhere else in the haystack.
    // TS map:   `expect(hits[0].start).toBe(7);`.
    assert_eq!(
        hits[0].start(),
        7,
        "match should start right after 'prefix ' (7 ASCII bytes)"
    );
}

#[test]
fn case_insensitive_em_dash_prefix_extracts_correctly() {
    // What:     Extract from `(?i)—Password`: an inline
    //           case-insensitivity flag followed by a literal that
    //           starts with an em-dash and has a capital `P`.
    // Why:      Covers the case-insensitive path with a non-ASCII
    //           leading character.
    // TS map:   `const extracted = extractGatingSubstrings("(?i)—Password")!;`.
    let extracted = extract_gating_substrings("(?i)—Password")
        .expect("expected Some for case-insensitive literal");
    assert_eq!(extracted.len(), 1);

    // What:     Split the single tuple into its substring and flag.
    // TS map:   `const [literal, caseInsensitive] = extracted[0];`.
    let literal = &extracted[0].0;
    let case_insensitive = extracted[0].1;

    // What:     Byte-level comparison: the `(?i)` prefix is gone, and
    //           the em-dash bytes and the capital `P` are unchanged.
    // Why:      The substring must keep its original bytes; the `ci`
    //           flag carries the case-insensitivity separately.
    // TS map:   `expect(literal).toBe("—Password");`.
    assert_eq!(
        literal.as_bytes(),
        b"\xe2\x80\x94Password",
        "extracted substring should preserve original UTF-8 bytes including the capital P"
    );

    // What:     The flag must be `true`.
    // Why:      `(?i)` was present on the pattern.
    //
    //           NOTE(review): ASCII-only case folding in the matcher
    //           would not relate `é` and `É`, so a rule such as
    //           `(?i)Café` may not gate `CAFÉ`. That concern is
    //           outside what this test checks -- the em-dash has no
    //           case. Confirm against the loader's AC configuration.
    // TS map:   `expect(caseInsensitive).toBe(true);`.
    assert!(case_insensitive, "ci flag should be true after stripping (?i)");
}

#[test]
fn emoji_prefix_round_trips_through_aho_corasick() {
    // What:     End-to-end check with a 4-byte UTF-8 leading character,
    //           `🔑` (`\xf0\x9f\x94\x91`), followed by ASCII `secret`.
    // Why:      The em-dash tests cover 3-byte sequences; this one
    //           covers the widest UTF-8 encoding.
    // TS map:   `const extracted = extractGatingSubstrings("🔑secret")!;`.
    let extracted = extract_gating_substrings("🔑secret")
        .expect("expected Some for plain literal");
    assert_eq!(extracted.len(), 1);

    // What:     Byte-level comparison of the extracted substring.
    // Why:      The four emoji bytes must survive extraction
    //           unchanged.
    let literal = &extracted[0].0;
    assert_eq!(
        literal.as_bytes(),
        b"\xf0\x9f\x94\x91secret",
        "extracted substring should preserve the original 4-byte emoji bytes"
    );

    // What:     Build the matcher from the extracted substrings and
    //           scan content that holds the phrase after 7 ASCII bytes.
    // Why:      Proves the extracted bytes are findable by the
    //           matcher, not merely equal to a fixture.
    // TS map:   `const hits = [...new AhoCorasick(needles).search(haystack)];`.
    let haystack = "prefix 🔑secret suffix";
    let needles: Vec<&str> = extracted.iter().map(|pair| pair.0.as_str()).collect();
    let automaton = AhoCorasick::new(&needles).expect("AC build should succeed");
    let hits: Vec<_> = automaton.find_iter(haystack).collect();
    assert!(
        !hits.is_empty(),
        "AC should find at least one match for the 4-byte emoji prefix"
    );
    assert_eq!(
        hits[0].start(),
        7,
        "match should start right after 'prefix ' (7 ASCII bytes)"
    );
}

#[test]
fn two_byte_utf8_prefix_round_trips_through_aho_corasick() {
    // What:     End-to-end check with a 2-byte UTF-8 leading character,
    //           `é` (`\xc3\xa9`), followed by ASCII `tudiant` -- nine
    //           bytes in total.
    // Why:      Completes the 2-, 3- and 4-byte coverage. Accented
    //           Latin letters are a likely way for a rule author to
    //           introduce non-ASCII bytes.
    // TS map:   `const extracted = extractGatingSubstrings("étudiant")!;`.
    let extracted = extract_gating_substrings("étudiant")
        .expect("expected Some for plain literal");
    assert_eq!(extracted.len(), 1);

    // What:     Byte-level comparison of the extracted substring.
    // Why:      The two `é` bytes must survive extraction unchanged.
    let literal = &extracted[0].0;
    assert_eq!(
        literal.as_bytes(),
        b"\xc3\xa9tudiant",
        "extracted substring should preserve the original 2-byte e-acute bytes"
    );

    // What:     Build the matcher from the extracted substrings and
    //           scan content that holds the word after 7 ASCII bytes.
    // Why:      Same round-trip invariant as the em-dash and emoji
    //           tests.
    // TS map:   `const hits = [...new AhoCorasick(needles).search(haystack)];`.
    let haystack = "prefix étudiant suffix";
    let needles: Vec<&str> = extracted.iter().map(|pair| pair.0.as_str()).collect();
    let automaton = AhoCorasick::new(&needles).expect("AC build should succeed");
    let hits: Vec<_> = automaton.find_iter(haystack).collect();
    assert!(
        !hits.is_empty(),
        "AC should find at least one match for the 2-byte e-acute prefix"
    );
    assert_eq!(
        hits[0].start(),
        7,
        "match should start right after 'prefix ' (7 ASCII bytes)"
    );
}

#[test]
fn anchor_prefix_extracts_after_strip() {
    // What:     Extract from `^—password`: a `^` line anchor followed
    //           by a literal that starts with an em-dash.
    // Why:      Covers an anchored pattern with a non-ASCII literal.
    //           The anchor must not appear in the result and the
    //           em-dash bytes must be intact.
    // TS map:   `const extracted = extractGatingSubstrings("^—password")!;`.
    let extracted = extract_gating_substrings("^—password")
        .expect("expected Some after anchor strip");
    assert_eq!(extracted.len(), 1);

    // What:     Byte-level comparison; only the substring half of the
    //           tuple is inspected.
    // TS map:   `expect(extracted[0][0]).toBe("—password");`.
    let literal = &extracted[0].0;
    assert_eq!(
        literal.as_bytes(),
        b"\xe2\x80\x94password",
        "extracted substring should preserve em-dash bytes after `^` strip"
    );
}

#[test]
fn short_non_ascii_prefix_rejected_by_min_prefix_len() {
    // What:     Pattern `—.*` is a single em-dash (3 UTF-8 bytes)
    //           followed by the `.` wildcard and a `*` quantifier, so
    //           the only literal available is the em-dash itself.
    //           Extraction is expected to succeed and return exactly
    //           those 3 bytes.
    // Why:      Pins that the minimum-prefix filter counts BYTES, not
    //           chars: one char that encodes to 3 bytes is accepted.
    //           NOTE(review): the filter and its threshold
    //           (`MIN_PREFIX_LEN`, presumably 3) live in the extractor,
    //           not in this file -- confirm there. The test name
    //           mentions only the rejection half; this first half is
    //           the acceptance side of the same boundary.
    // TS map:   `const accepted = extractGatingSubstrings("—.*")!;`.
    let accepted = extract_gating_substrings("—.*")
        .expect("3-byte em-dash prefix should pass MIN_PREFIX_LEN");
    assert_eq!(accepted.len(), 1);
    let literal = &accepted[0].0;
    assert_eq!(literal.as_bytes(), b"\xe2\x80\x94");

    // What:     Same shape with `é` (2 UTF-8 bytes) as the only
    //           literal; extraction must return `None`.
    // Why:      Pins the byte-length boundary from the other side: a
    //           single char that encodes to 2 bytes is too short.
    // TS map:   `expect(extractGatingSubstrings("é.*")).toBeNull();`.
    let rejected = extract_gating_substrings("é.*");
    assert!(
        rejected.is_none(),
        "2-byte e-acute prefix is below MIN_PREFIX_LEN (bytes), should be None"
    );
}

#[test]
fn alternation_with_non_ascii_extracts_both_branches() {
    // What:     Extract from `(?:—password|—token)`: a non-capturing
    //           group with two branches, each starting with an em-dash.
    //           The result must hold one substring per branch, in
    //           branch order.
    // Why:      Covers the multi-substring-per-rule path with
    //           non-ASCII literals. Gating is only sound if every
    //           branch contributes its own literal.
    // TS map:   `const branches = extractGatingSubstrings("(?:—password|—token)")!;`.
    let branches = extract_gating_substrings("(?:—password|—token)")
        .expect("expected Some for alternation of literals");
    assert_eq!(
        branches.len(),
        2,
        "expected one substring per alternation branch"
    );

    // What:     Byte-level check of each branch's literal. The length
    //           assertion above makes both index reads safe.
    // TS map:   `const [[first], [second]] = branches;`.
    let (first, _) = &branches[0];
    let (second, _) = &branches[1];
    assert_eq!(
        first.as_bytes(),
        b"\xe2\x80\x94password",
        "first branch should be em-dash + password"
    );
    assert_eq!(
        second.as_bytes(),
        b"\xe2\x80\x94token",
        "second branch should be em-dash + token"
    );

    // What:     Build the matcher from both literals and scan content
    //           that contains only the second branch's text.
    // Why:      With both branches registered, a file holding just
    //           one of them must still trigger the gate.
    // TS map:   `expect([...automaton.search("here is —token")].length).toBeGreaterThan(0);`.
    let needles: Vec<&str> = branches.iter().map(|pair| pair.0.as_str()).collect();
    let automaton = AhoCorasick::new(&needles).expect("AC build should succeed");
    let hits: Vec<_> = automaton.find_iter("here is —token").collect();
    assert!(
        !hits.is_empty(),
        "AC should fire on the second-branch literal"
    );
}

// What:     `#[test] fn positive_lookahead_at_start_extracts_after_body()`.
//           Pattern `(?=foo)bar` -- positive lookahead at the head of
//           the rule, followed by literal `bar`. Walker should skip
//           the lookahead and extract `bar`.
// Why:      Pre-fix the walker bailed at `(?=` (because
//           `group_body_start` returned `None` for that opener),
//           leaving no extracted literal and dropping the rule into
//           the residual bucket. Post-fix the lookaround is treated
//           as a transparent zero-width atom and the walker
//           continues.
// TS map:   `test("positive lookahead at start extracts after body", () => { ... });`.
//
// In TS you'd write (pseudocode):
// ```ts
// test("positive lookahead at start extracts after body", () => {
//   const subs = extractGatingSubstrings("(?=foo)bar")!;
//   expect(subs.length).toBe(1);
//   expect(subs[0].sub).toBe("bar");
// });
// ```
#[test]
fn positive_lookahead_at_start_extracts_after_body() {
    // What:     Extract from `(?=foo)bar` and expect the single
    //           literal `bar` with the ci flag off.
    // Why:      The leading lookahead must be skipped rather than
    //           ending extraction; the literal after it is the gate.
    // TS map:   `const extracted = extractGatingSubstrings("(?=foo)bar")!;`.
    let extracted = extract_gating_substrings("(?=foo)bar")
        .expect("expected Some after lookahead skip");
    assert_eq!(extracted.len(), 1);
    let literal = &extracted[0].0;
    let case_insensitive = extracted[0].1;
    assert_eq!(literal.as_bytes(), b"bar");
    assert!(!case_insensitive, "ci flag should be false (no (?i) prefix)");
}

#[test]
fn negative_lookahead_at_start_extracts_after_body() {
    // What:     Extract from `(?!foo)bar`: a negative lookahead at the
    //           head, then the literal `bar`. The result must be the
    //           single literal `bar`.
    // Why:      The body of a negative lookaround (`foo`) must never
    //           become a gating literal: a real match guarantees
    //           `foo` is absent at that position, so gating on it
    //           would be unsound. `bar` is what the regex requires.
    // TS map:   `expect(extractGatingSubstrings("(?!foo)bar")![0][0]).toBe("bar");`.
    let extracted = extract_gating_substrings("(?!foo)bar")
        .expect("expected Some after negative-lookahead skip");
    assert_eq!(extracted.len(), 1);
    let (literal, _) = &extracted[0];
    assert_eq!(literal.as_bytes(), b"bar");
}

#[test]
fn positive_lookbehind_at_start_extracts_after_body() {
    // What:     Extract from `(?<=foo)bar`: a positive lookbehind at
    //           the head, then the literal `bar`. The result must be
    //           the single literal `bar`.
    // Why:      `(?<=` shares its first three bytes with the
    //           named-capture opener `(?<name>`; this pins that the
    //           lookbehind form is recognised and skipped.
    // TS map:   `expect(extractGatingSubstrings("(?<=foo)bar")![0][0]).toBe("bar");`.
    let extracted = extract_gating_substrings("(?<=foo)bar")
        .expect("expected Some after positive-lookbehind skip");
    assert_eq!(extracted.len(), 1);
    let (literal, _) = &extracted[0];
    assert_eq!(literal.as_bytes(), b"bar");
}

#[test]
fn negative_lookbehind_at_start_extracts_after_body() {
    // What:     Extract from `(?<!foo)bar`: a negative lookbehind at
    //           the head, then the literal `bar`. The result must be
    //           the single literal `bar`.
    // Why:      Fourth and last lookaround flavour. As with negative
    //           lookahead, the lookaround body must not be extracted
    //           as a required literal.
    // TS map:   `expect(extractGatingSubstrings("(?<!foo)bar")![0][0]).toBe("bar");`.
    let extracted = extract_gating_substrings("(?<!foo)bar")
        .expect("expected Some after negative-lookbehind skip");
    assert_eq!(extracted.len(), 1);
    let (literal, _) = &extracted[0];
    assert_eq!(literal.as_bytes(), b"bar");
}

#[test]
fn lookahead_at_end_extracts_before_body() {
    // What:     Extract from `foobar(?=baz)`: the literal `foobar`
    //           followed by a trailing lookahead. The result must be
    //           the single literal `foobar`.
    // Why:      Regression guard for the literal-then-lookaround
    //           ordering: the literal seen before the lookahead must
    //           remain the chosen candidate.
    // TS map:   `expect(extractGatingSubstrings("foobar(?=baz)")![0][0]).toBe("foobar");`.
    let extracted = extract_gating_substrings("foobar(?=baz)")
        .expect("expected Some with literal-then-lookahead");
    assert_eq!(extracted.len(), 1);
    let (literal, _) = &extracted[0];
    assert_eq!(literal.as_bytes(), b"foobar");
}

#[test]
fn lookahead_in_middle_extracts_best_literal() {
    // What:     Feeds `foofoo(?=x)bar`: a 6-byte literal, a lookahead,
    //           then a 3-byte literal. Expects exactly one gate, and
    //           that gate is the longer leading literal `foofoo`.
    // Why:      One required substring per branch is enough for
    //           soundness, and the longest candidate is the most
    //           selective. The trailing `bar` is still walked after
    //           the lookaround, but it must lose to `foofoo`.
    // TS map:   `extractGatingSubstrings("foofoo(?=x)bar")`.
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // const gates = extractGatingSubstrings("foofoo(?=x)bar")!;
    // expect(gates[0].sub).toBe("foofoo");
    // ```
    let pattern = "foofoo(?=x)bar";
    let gates = extract_gating_substrings(pattern)
        .expect("expected Some with literal-lookahead-literal");
    assert_eq!(gates.len(), 1);
    let (literal, _ci) = &gates[0];
    assert_eq!(
        literal.as_bytes(),
        b"foofoo",
        "extract_branch should pick the longest of the two literals"
    );
}

#[test]
fn lookahead_in_middle_picks_longer_after_skip() {
    // What:     Feeds `foo(?=x)barbaz`: a 3-byte literal, a lookahead,
    //           then a 6-byte literal. Expects exactly one gate, and
    //           that gate is the longer trailing literal `barbaz`.
    // Why:      Mirror image of the `foofoo(?=x)bar` case. Before the
    //           lookaround fix the walk ended at `(?=` and `foo` was
    //           left as the gate; with the skip in place the walk
    //           reaches `barbaz`, which replaces `foo` as the more
    //           selective candidate. This is the perf gap the fix
    //           was written to close.
    // TS map:   `extractGatingSubstrings("foo(?=x)barbaz")`.
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // const gates = extractGatingSubstrings("foo(?=x)barbaz")!;
    // expect(gates[0].sub).toBe("barbaz");
    // ```
    let pattern = "foo(?=x)barbaz";
    let gates = extract_gating_substrings(pattern)
        .expect("expected Some after lookahead skip");
    assert_eq!(gates.len(), 1);
    let (literal, _ci) = &gates[0];
    assert_eq!(
        literal.as_bytes(),
        b"barbaz",
        "post-fix walker should continue past lookahead and pick the longer trailing literal"
    );
}

#[test]
fn prose_em_dash_pattern_extracts_middle_literal() {
    // What:     Feeds the exact rule from the bug report,
    //           `(?<=[a-z]) -- (?=[a-z])`: a lookbehind for a
    //           lowercase letter, the 4-byte literal ` -- ` (space,
    //           hyphen, hyphen, space), and a lookahead for a
    //           lowercase letter. Expects the single gate ` -- `.
    // Why:      Headline regression. Before the fix this rule got no
    //           AC gate at all and ran as a residual per-rule scan;
    //           now the literal between the two zero-width
    //           assertions must be extracted so the rule lands in
    //           the AC prefix bucket.
    // TS map:   `extractGatingSubstrings("(?<=[a-z]) -- (?=[a-z])")`.
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // const gates = extractGatingSubstrings("(?<=[a-z]) -- (?=[a-z])")!;
    // expect(gates[0].sub).toBe(" -- ");
    // ```
    let pattern = "(?<=[a-z]) -- (?=[a-z])";
    let gates = extract_gating_substrings(pattern)
        .expect("expected Some after lookbehind+lookahead skip");
    assert_eq!(gates.len(), 1);
    let (literal, _ci) = &gates[0];
    assert_eq!(
        literal.as_bytes(),
        b" -- ",
        "literal between the two zero-width lookarounds should be the AC gate"
    );
}

#[test]
fn nested_lookaround_extracts_after_outer() {
    // What:     Feeds `(?=(?:foo|bar))baz`: a positive lookahead whose
    //           body is a non-capturing group containing an
    //           alternation, followed by the literal `baz`. Expects
    //           the single gate `baz`.
    // Why:      Skipping a lookaround only requires finding the `)`
    //           that closes the OUTER group; the inner structure is
    //           irrelevant. That relies on paren-depth tracking (the
    //           inner `)` takes depth 2 -> 1, the outer one 1 -> 0).
    //           If depth were mis-tracked the walk would resume in
    //           the wrong place and `baz` would not come out cleanly.
    // TS map:   `extractGatingSubstrings("(?=(?:foo|bar))baz")`.
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // const gates = extractGatingSubstrings("(?=(?:foo|bar))baz")!;
    // expect(gates[0].sub).toBe("baz");
    // ```
    let pattern = "(?=(?:foo|bar))baz";
    let gates = extract_gating_substrings(pattern)
        .expect("expected Some after nested-lookaround skip");
    assert_eq!(gates.len(), 1);
    let (literal, _ci) = &gates[0];
    assert_eq!(literal.as_bytes(), b"baz");
}

#[test]
fn lookahead_does_not_break_named_capture_path() {
    // What:     Feeds `(?<name>foo)bar` -- a NAMED CAPTURE, not a
    //           lookbehind -- and checks that the first gate is
    //           either `foo` or `foobar`.
    // Why:      Regression guard for the lookbehind discriminator.
    //           `(?<` is only a lookbehind when the next byte is `=`
    //           or `!`; any other byte starts a capture name. A
    //           detector that treated every `(?<` as a lookbehind
    //           would skip the capture body instead of recursing
    //           into it, breaking every named-capture rule.
    // TS map:   `extractGatingSubstrings("(?<name>foo)bar")`.
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // const gates = extractGatingSubstrings("(?<name>foo)bar")!;
    // expect(["foo", "foobar"]).toContain(gates[0].sub);
    // ```
    let gates = extract_gating_substrings("(?<name>foo)bar")
        .expect("named-capture rule should still gate");
    let gate = &gates[0].0;

    // What:     Accept the group body alone (`foo`) or the body
    //           concatenated with the tail (`foobar`).
    // Why:      This test does not care which literal wins. It only
    //           rules out the failure shape where the capture body
    //           was skipped and `bar` alone became the gate.
    // TS map:   `expect(["foo", "foobar"]).toContain(gates[0].sub);`.
    let body_survived = matches!(gate.as_bytes(), b"foo" | b"foobar");
    assert!(
        body_survived,
        "named-capture body should still gate; got {:?}",
        gate
    );
}

#[test]
fn prose_em_dash_pattern_round_trips_through_aho_corasick() {
    // What:     End-to-end check for the bug-report rule
    //           `(?<=[a-z]) -- (?=[a-z])`: extract its gates, build
    //           an Aho-Corasick automaton from them, and search the
    //           text `hello -- world`, which the rule matches.
    // Why:      Soundness invariant: every registered AC pattern
    //           must occur in any string the regex matches. The
    //           gate ` -- ` is part of what the regex requires, so
    //           the automaton has to fire on this content, and the
    //           first hit has to begin right after `hello`.
    // TS map:   the same pipeline with a JS Aho-Corasick port.
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // const gates = extractGatingSubstrings("(?<=[a-z]) -- (?=[a-z])")!;
    // const ac = new AhoCorasick(gates.map(([s, _]) => s));
    // const first = ac.search("hello -- world").next().value;
    // expect(first?.start).toBe(5);
    // ```
    let gates = extract_gating_substrings("(?<=[a-z]) -- (?=[a-z])")
        .expect("expected Some after both lookaround skips");
    let needles: Vec<&str> = gates
        .iter()
        .map(|(needle, _ci)| needle.as_str())
        .collect();
    let automaton = AhoCorasick::new(&needles).expect("AC build should succeed");

    let haystack = "hello -- world";
    // Only the first hit matters, so take it lazily rather than
    // collecting every match.
    let first_hit_start = automaton
        .find_iter(haystack)
        .next()
        .map(|hit| hit.start());

    assert!(
        first_hit_start.is_some(),
        "AC should fire on ` -- ` for prose em-dash content"
    );
    assert_eq!(
        first_hit_start,
        Some(5),
        "match should start at byte offset 5 (after `hello`)"
    );
}

// What:     `#[test] fn inline_flag_propagates_ci_to_subsequent_literal()`.
//           BUG 1 regression test: an inline `(?i)` in the middle of a
//           rule must change the case-insensitivity (ci) tag of every
//           literal that follows it in the same scope.
// Why:      Before the fix, the inline-flag arm of
//           `skip_atom_with_extract` returned `Some((rest, None))` and
//           never told `extract_branch` that ci had changed, so later
//           literals kept the old tag. For `literalA(?i)keyword-suffix`
//           that meant `keyword-suffix` was registered ci=false in the
//           case-sensitive AC bucket. The regex matched
//           `KEYWORD-SUFFIX`, the gate did not, and the rule silently
//           missed. The fix bubbles the updated ci up to the caller.
// TS map:   `test("inline (?i) propagates ci to subsequent literal", () => { ... })`.
//
// In TS you'd write (pseudocode):
// ```ts
// test("inline (?i) propagates ci to subsequent literal", () => {
//   const gates = extractGatingSubstrings("literalA(?i)keyword-suffix")!;
//   expect(gates[0].sub).toBe("keyword-suffix");
//   expect(gates[0].ci).toBe(true);
// });
// ```
#[test]
fn inline_flag_propagates_ci_to_subsequent_literal() {
    // What:     `literalA(?i)keyword-suffix` is an 8-byte literal, an
    //           inline `(?i)` group, then a 14-byte literal.
    // Why:      The 14-byte literal is the longer one, so it is the
    //           expected winner. What this test pins is its ci tag:
    //           it must reflect the `(?i)` that precedes it in
    //           source order.
    // TS map:   `extractGatingSubstrings("literalA(?i)keyword-suffix")`.
    let gates = extract_gating_substrings("literalA(?i)keyword-suffix")
        .expect("expected Some for literal + inline-flag + literal pattern");
    assert_eq!(gates.len(), 1, "walker should pick a single best literal");

    let (literal, case_insensitive) = (&gates[0].0, gates[0].1);
    assert_eq!(
        literal.as_bytes(),
        b"keyword-suffix",
        "longer literal `keyword-suffix` (14 bytes) wins over `literalA` (8 bytes)"
    );
    assert!(
        case_insensitive,
        "BUG 1: ci must be true after the inline (?i) flag; pre-fix this was false"
    );
}

// What:     `#[test] fn unicode_flag_disables_extraction()`. BUG 2
//           regression test: a `u` in the leading flag group must
//           send the rule to residual scanning rather than the AC
//           gate path.
// Why:      Before the fix, `(?iu)cafésecret` had its literal placed
//           in the AC-CI bucket. aho-corasick's ASCII case folding
//           does not equate `É` with `é`, so a file containing
//           `CAFÉSECRET` never tripped the gate, the regex's
//           find_all never ran, and the rule silently missed. After
//           the fix `extract_gating_substrings` returns None when the
//           leading flags include `u`, and the residual resharp scan
//           (which folds Unicode case correctly) takes over.
// TS map:   `test("(?u) or (?iu) leading flag disables extraction", () => { ... })`.
//
// In TS you'd write (pseudocode):
// ```ts
// test("(?u) leading flag disables extraction", () => {
//   expect(extractGatingSubstrings("(?iu)cafésecret")).toBeNull();
// });
// ```
#[test]
fn unicode_flag_disables_extraction() {
    // What:     Every leading-flag spelling that includes `u` --
    //           `(?iu)`, `(?ui)` and bare `(?u)` -- must yield None.
    // Why:      The AC-CI gate relies on aho-corasick's
    //           ascii_case_insensitive mode, which folds ASCII
    //           letters only. Non-ASCII case pairs (É <-> é,
    //           Á <-> á, ...) would slip past it, which makes the
    //           gate unsound for these rules.
    // TS map:   `expect(extractGatingSubstrings("(?iu)cafésecret")).toBeNull();`
    //           and likewise for `(?ui)` and `(?u)`.
    let extraction_disabled = |pattern: &str| extract_gating_substrings(pattern).is_none();

    assert!(
        extraction_disabled("(?iu)cafésecret"),
        "BUG 2: (?iu) leading flag must disable extraction"
    );
    assert!(
        extraction_disabled("(?ui)cafésecret"),
        "BUG 2: (?ui) leading flag must disable extraction"
    );
    assert!(
        extraction_disabled("(?u)cafésecret"),
        "BUG 2: (?u) leading flag must disable extraction (conservative)"
    );

    // What:     Plain `(?i)` with no `u` must STILL extract, and the
    //           gate must be tagged case-insensitive.
    // Why:      Regression guard in the other direction. This is the
    //           common shape that puts hundreds of betterleaks rules
    //           on the AC-CI fast path. A change that bailed on any
    //           `i` flag would satisfy the negative checks above
    //           while wrecking performance on the corpus.
    // TS map:   `expect(extractGatingSubstrings("(?i)keyword-suffix")).not.toBeNull();`.
    let gates = extract_gating_substrings("(?i)keyword-suffix")
        .expect("plain (?i) without u flag must still extract");
    assert_eq!(gates.len(), 1);
    let (literal, case_insensitive) = (&gates[0].0, gates[0].1);
    assert_eq!(literal.as_bytes(), b"keyword-suffix");
    assert!(case_insensitive, "ci should be true for plain (?i)");
}

#[test]
fn inline_negated_flag_clears_ci_for_subsequent_literal() {
    // What:     Feeds `(?i)shorty(?-i)keyword-suffix`. The leading
    //           `(?i)` turns ci on, `shorty` (6 bytes) is walked
    //           with ci=true, the inline `(?-i)` turns ci back off,
    //           and `keyword-suffix` (14 bytes) follows. Expects one
    //           gate, `keyword-suffix`, tagged ci=false.
    // Why:      Mirror of the inline `(?i)` test: the same
    //           flag-change propagation, in the opposite direction.
    // TS map:   `extractGatingSubstrings("(?i)shorty(?-i)keyword-suffix")`.
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // const gates = extractGatingSubstrings("(?i)shorty(?-i)keyword-suffix")!;
    // expect(gates[0].ci).toBe(false);
    // ```
    let gates = extract_gating_substrings("(?i)shorty(?-i)keyword-suffix")
        .expect("expected Some for outer (?i) + inline (?-i) + literal");
    assert_eq!(gates.len(), 1);

    let (literal, case_insensitive) = (&gates[0].0, gates[0].1);
    assert_eq!(literal.as_bytes(), b"keyword-suffix");
    assert!(
        !case_insensitive,
        "BUG 1 (symmetric): inline (?-i) must clear the outer (?i) for subsequent literals"
    );
}

// What:     `#[test] fn scoped_extended_flag_disables_body_extraction()`.
//           BUG 9 regression test. A scoped flag group `(?x:body)`
//           switches on free-spacing mode for `body`, so whitespace
//           inside it is ignored by the engine rather than matched.
//           Before the fix, the scoped-flag arm of
//           `skip_atom_with_extract` handed the body to
//           `extract_scope` unchanged; the spaces were read as
//           literal bytes and `foo bar` (WITH the space) was
//           registered in the AC bucket. The rule really matches
//           `foobar`, so the gate never fired, `find_all` never ran,
//           and the rule was silently dead while appearing to be on
//           the fast path.
// Why:      The `extract_gating_substrings` soundness contract says a
//           registered substring must be byte-for-byte something the
//           regex consumes. Honouring that inside `(?x:...)` would
//           need an `x`-aware extractor, so the safe choice is to
//           extract nothing from such a body and let the rule fall
//           back to residual scanning.
// TS map:   `test("(?x:body) disables body extraction", () => { ... })`.
//
// In TS you'd write (pseudocode):
// ```ts
// test("(?x:body) disables body extraction", () => {
//   const gates = extractGatingSubstrings(String.raw`required\_(?x:foo bar)`);
//   expect(gates?.[0]?.sub).toBe("required_");
//   const onlyX = extractGatingSubstrings("(?x:foo bar)");
//   expect(onlyX).toBeNull();
// });
// ```
#[test]
fn scoped_extended_flag_disables_body_extraction() {
    // What:     `(?x:foo bar)` on its own has no literal outside the
    //           scoped group. Under `x` the body matches `foobar`,
    //           so the only sound candidates would be `foo` and
    //           `bar` separately. The extractor does not attempt
    //           that; the expected result is None.
    // Why:      The AC gate cannot soundly represent the body, so
    //           the rule must fall through to residual scanning.
    // TS map:   `expect(extractGatingSubstrings("(?x:foo bar)")).toBeNull();`.
    let body_only = extract_gating_substrings("(?x:foo bar)");
    assert!(
        body_only.is_none(),
        "BUG 9: scoped (?x:body) must not extract any substring"
    );

    // What:     `required\_(?x:foo bar)` puts a literal prefix in
    //           front of the scoped group. The escaped `\_` is a
    //           literal underscore (BUG 10's escape handling), so
    //           the expected gate is `required_` (9 bytes). A bare
    //           `_` would be a wildcard and shorten the gate to
    //           `required`; the escaped form keeps this assertion
    //           stable across the BUG 10 wildcard change.
    // Why:      Regression guard: a fix that abandoned the whole
    //           rule on seeing `(?x:` would lose the outer literal.
    //           Only the scoped body should be suppressed.
    // TS map:   `expect(extractGatingSubstrings(String.raw`required\_(?x:foo bar)`)?.[0].sub).toBe("required_");`.
    let gates = extract_gating_substrings(r"required\_(?x:foo bar)")
        .expect("outer literal must still extract even with (?x:body) after");
    assert_eq!(gates.len(), 1, "expected exactly one substring (outer literal)");
    let (outer_literal, _ci) = &gates[0];
    assert_eq!(outer_literal.as_bytes(), b"required_");
}

// What:     `#[test] fn bare_underscore_wildcard_does_not_appear_in_gate()`.
//           BUG 10 (extract side). Resharp treats an unescaped `_` as
//           a universal wildcard. The engine-level fix routes rules
//           containing bare `_` to resharp, but the extractor needs
//           the same awareness: before the fix the literal walker
//           consumed `_` as an ordinary byte, so `pre_post` registered
//           the substring `pre_post` (with `_`) in AC. The rule
//           actually matches `preXpost` for any byte `X`, so the gate
//           never fired. After the fix the walker stops at an
//           unescaped `_`, treats it as a zero-contribution wildcard
//           atom, and keeps going to pick up the literals around it.
// Why:      Without this the engine-side fix is only half done: the
//           rule is routed to resharp correctly but never runs,
//           because its AC gate is registered against the wrong
//           literal.
// TS map:   `test("bare _ wildcard skipped by extractor", () => { ... })`.
//
// In TS you'd write (pseudocode):
// ```ts
// test("bare _ wildcard skipped by extractor", () => {
//   const subs = extractGatingSubstrings("pre_post")!;
//   expect(subs.length).toBeGreaterThan(0);
//   for (const [sub] of subs) {
//     expect(sub).not.toContain("_");
//   }
// });
// ```
#[test]
fn bare_underscore_wildcard_does_not_appear_in_gate() {
    // What:     `pre_post` -- `pre` and `post` flank the wildcard. The
    //           walker should pick the longer side (`post`, 4 bytes);
    //           `pre` (3 bytes, meets MIN_PREFIX_LEN) would also be
    //           valid. This test does not pin which side wins, only
    //           that at least one gate exists and that no gate
    //           contains the literal `_`.
    // Why:      `_` is a wildcard in resharp; an AC pattern containing
    //           it would look for a byte the rule does not require.
    // TS map:   `expect(subs[0].sub).not.toContain("_");`.
    let subs = extract_gating_substrings("pre_post")
        .expect("expected Some -- some literal side of the wildcard must extract");
    // Guard against a vacuous pass: with an empty list the loop below
    // would run zero times and check nothing.
    assert!(
        !subs.is_empty(),
        "BUG 10: `pre_post` must yield at least one gating substring"
    );
    for (sub, _ci) in &subs {
        assert!(
            !sub.contains('_'),
            "BUG 10: gating substring {:?} must not contain bare `_` (resharp wildcard)",
            sub
        );
    }

    // What:     `\_` (escaped) is a literal underscore. The walker
    //           pushes `_` as a literal byte and the gate carries it
    //           through.
    // Why:      Regression guard for the escape-handling path.
    //           Hundreds of betterleaks GitHub PAT rules use `ghp\_`
    //           shapes and must keep extracting `ghp_` (with the
    //           literal underscore) as their gate.
    // TS map:   `expect(extractGatingSubstrings(String.raw`pre\_post`)?.[0].sub).toContain("_");`.
    let subs = extract_gating_substrings(r"pre\_post")
        .expect("expected Some for escaped-underscore literal");
    assert_eq!(subs.len(), 1, "expected one substring (the full literal)");
    assert_eq!(
        subs[0].0.as_bytes(),
        b"pre_post",
        "BUG 10 regression: escaped \\_ must keep the underscore as literal"
    );
}