forbidden-strings 0.1.9

Out-of-band scanner for forbidden literal strings and regex patterns. Gitignore-aware, fast, dependency-light: built for CI deny-listing of leaked credentials and banned tokens.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
// What:     `use super::constants::TROUBLESHOOT_REF;` imports the shared
//           troubleshooting-doc suffix from the parent engine module.
// Why:      Every rejection message should point at the same long-form
//           resharp workaround document without duplicating the path.
// TS map:   `import { TROUBLESHOOT_REF } from "./constants";`.
//
// In TS you'd write (pseudocode):
// ```ts
// import { TROUBLESHOOT_REF } from "./constants";
// ```
use super::constants::TROUBLESHOOT_REF;

// What:     `pub fn intersection_with_lookbehind(src: &str) -> Option<String>`
//           detects rule shapes that match resharp 0.5.x through 0.6.x's
//           lookahead-vs-lookbehind intersection debug_assert at
//           `resharp/src/engine.rs:1020` (`unexpected end 0 > N`,
//           where N varies by content length: 56 in 0.5.3, 1 in 0.6.0,
//           the assertion lives behind `debug_assert!` so release
//           returns corrupted matches silently instead). The minimal
//           reproducer is
//           `(?:(?=a)&(?<=_))` driven against a content slice of
//           at least 64 bytes; the panic fires inside
//           `scan_fwd_all` during the runtime forward scan, not
//           at compile. The detector reports the shape at
//           compile time so callers get an actionable error
//           BEFORE the rule reaches the scan path.
// Why:      Catch-and-convert via `catch_unwind` in
//           `CompiledRegex::find_all` already keeps the scanner
//           process alive; this pre-validator gives the rule
//           author a clean message ("intersection involving a
//           lookbehind") instead of a generic engine-error
//           synthetic hit. Bisection (see docs/troubleshooting/resharp.md)
//           narrowed the trigger to "intersection (`&` outside
//           class) where at least one operand contains a
//           lookbehind `(?<=` or `(?<!`". The two-lookahead
//           variant (`(?:(?=a)&(?=b))`) does not panic; it
//           returns `Algebra(UnsupportedPattern)` -- only the
//           lookbehind path hits the assertion. Detection is
//           conservative: any presence of intersection + any
//           lookaround anywhere outside a class triggers (the
//           walker was later widened from lookbehind-only to
//           also cover `(?=` / `(?!`); we accept rare false
//           positives on contrived shapes (rule authors get a
//           friendlier "rewrite without intersecting a
//           lookbehind" message instead of the opaque assertion)
//           in exchange for not having to walk operand
//           boundaries.
// TS map:   `function intersectionWithLookbehind(src: string): string | null`.
//
// In TS you'd write (pseudocode):
// ```ts
// function intersectionWithLookbehind(src: string): string | null {
//   // walk bytes outside character classes; set hasIntersection on
//   // bare `&`; set hasLookaround on `(?=`, `(?!`, `(?<=` or `(?<!`.
//   // Return a reason when both are seen, null otherwise.
// }
// ```
pub fn intersection_with_lookbehind(src: &str) -> Option<String> {
    // What:     Classifies the bytes starting at a `(`. Returns
    //           "lookahead" for `(?=` / `(?!`, "lookbehind" for
    //           `(?<=` / `(?<!`, and `None` for every other group
    //           opener (`(?:`, `(?i)`, `(?<name>`, `(?P<name>`, a
    //           plain capture, or a truncated tail).
    // Why:      The byte after `(?` is the discriminator; slice
    //           patterns express the length checks and the byte
    //           comparisons in a single step.
    // TS map:   `function lookaroundOpening(rest: Uint8Array): string | null`.
    fn lookaround_opening(rest: &[u8]) -> Option<&'static str> {
        match rest {
            [b'(', b'?', b'=' | b'!', ..] => Some("lookahead"),
            [b'(', b'?', b'<', b'=' | b'!', ..] => Some("lookbehind"),
            _ => None,
        }
    }

    // What:     Single linear walk over the pattern bytes tracking
    //           three facts: whether we are inside a `[...]` class,
    //           whether a bare `&` was seen outside a class, and
    //           the direction of the FIRST lookaround seen outside
    //           a class (`None` until one is seen).
    // Why:      Despite the function name, both lookahead and
    //           lookbehind count: the detector was widened from
    //           lookbehind-only so that detection is symmetric
    //           across lookaround direction. Remembering the first
    //           direction lets the message name it.
    // TS map:   One loop, one boolean pair plus a nullable string.
    let pattern = src.as_bytes();
    let mut pos = 0usize;
    let mut inside_class = false;
    let mut saw_ampersand = false;
    let mut first_kind: Option<&'static str> = None;

    while let Some(&byte) = pattern.get(pos) {
        match byte {
            // What:     `\X` escape -- step over both bytes.
            // Why:      The escaped byte is a literal, so `\&`,
            //           `\(`, `\[` and `\]` must not be read as
            //           structure. Applies inside classes too.
            b'\\' => {
                pos += 2;
                continue;
            }
            b'[' if !inside_class => inside_class = true,
            b']' if inside_class => inside_class = false,
            b'&' if !inside_class => saw_ampersand = true,
            b'(' if !inside_class => {
                // Keep the first direction seen; later lookarounds
                // never overwrite it.
                first_kind = first_kind.or_else(|| lookaround_opening(&pattern[pos..]));
            }
            _ => {}
        }
        // What:     Report as soon as both signals are present.
        // Why:      The signals only change on the `&` / `(` arms
        //           above, so this fires on exactly the byte that
        //           completes the pair.
        if let (true, Some(kind)) = (saw_ampersand, first_kind) {
            return Some(format!(
                "intersection (`&`) involving a {} triggers a known resharp 0.5.x through 0.6.3 soundness bug: the lookbehind-stripping rewrite `strip_lb` (`resharp-algebra/src/lib.rs:2007`) leaves the lookbehind in place, so release silently returns wrong matches (debug hits a `debug_assert!`). Rewrite the rule to lift the {} outside the intersection (e.g. anchor it as a prefix), or replace it with an explicit consume of the relevant byte. {}",
                kind, kind, TROUBLESHOOT_REF
            ));
        }
        pos += 1;
    }
    None
}

// What:     `pub fn lookaround_in_alternation_with_sibling(src: &str) -> Option<String>`
//           detects rule shapes that compile through resharp's
//           parser but trigger the `engine.rs:1020` `debug_assert!`
//           (`unexpected end 0 > N`) at scan time. The minimal
//           reproducer bisected from
//           `fuzz/artifacts/fuzz_extract_gate_soundness/crash-8cba104f0805ccb567513aff895398a4f652200c`
//           is `(a|(?![_]))(?!a)` -- a capturing alternation whose
//           branches include a negative lookahead with a single-char
//           class body, followed by ANOTHER negative lookahead. The
//           pattern compiles (because alternation provides a
//           non-lookaround branch the algebra can simplify against)
//           but scanning panics during the forward DFA pass.
//
//           Variants confirmed to trigger the same panic:
//             - `(a|(?![_]))(?![a-e-u-vaaa])` (original artifact)
//             - `(?:a|(?![_]))(?!a)` (non-capturing first group)
//             - `((?![_])|a)(?!a)` (lookaround as first alt branch)
//             - `(a|(?![X]))(?!a)` for X in `_`, `0`, `.`, `-`, `|`, `^a`
//
//           Variants that do NOT trigger:
//             - `(a|(?!a))(?!a)` (bare atom in first lookahead, not class)
//             - `(a|(?![ab]))(?!a)` (class with two chars)
//             - `(?!a)(a|(?!a))` (lookaround before alternation, not after)
//             - `(?!a)b(?!c)` (atom between two lookaheads, no alt)
// Why:      `catch_unwind` in `CompiledRegex::find_all` already
//           converts the upstream panic into `Err(())` so production
//           scanning degrades gracefully. But libFuzzer-sys's panic
//           hook calls `abort()` before `catch_unwind` can intercept
//           in the fuzz harness, so the fuzz target sees a crash on
//           every iteration that hits this shape. The pre-validator
//           rejects the shape at compile time, surfacing an
//           actionable error and skipping the input so the fuzzer
//           can continue exploring the (?u)-Unicode space the
//           soundness-by-revert phase 11 verification needs.
//
//           Detection algorithm (single-pass byte walker):
//             - Maintain a stack of `(has_alternation, has_lookaround)`
//               flags, one entry per open paren. Each `(` pushes
//               `(false, false)`; each `)` pops the top entry.
//             - On `|` outside class, set top entry's
//               `has_alternation = true` (the alternation belongs
//               to the innermost group).
//             - On `(?=` / `(?!` / `(?<=` / `(?<!`, count
//               `total_lookarounds += 1` AND mark the CURRENT top
//               of the stack (the parent group containing this
//               lookaround) `has_lookaround = true` -- this models
//               "this group's body contains a lookaround". Push
//               `(false, false)` for the lookaround's own group
//               (its body is irrelevant for our pattern).
//             - On `)` pop: if the popped frame has BOTH
//               `has_alternation && has_lookaround` AND
//               `total_lookarounds >= 2` (there is another
//               lookaround outside this group), return Some(reason).
//           Conservative: false positives are acceptable. The
//           panicking shape is virtually never authored
//           intentionally; rule authors writing intersection-of-
//           lookaround patterns are rare, and the alternative is
//           a fuzz crash on every iteration.
// TS map:   `function lookaroundInAlternationWithSibling(src: string): string | null`.
//
// In TS you'd write (pseudocode):
// ```ts
// function lookaroundInAlternationWithSibling(src: string): string | null {
//   // walk bytes; maintain paren stack of (hasAlt, hasLookaround);
//   // track totalLookarounds. On `)` pop, check the combined
//   // pattern; return reason when matched.
// }
// ```
pub fn lookaround_in_alternation_with_sibling(src: &str) -> Option<String> {
    let bytes = src.as_bytes();
    let mut i = 0usize;
    let mut in_class = false;
    // What:     `paren_stack: Vec<(bool, bool)>` per-open-paren flags.
    //           Each entry `(has_alternation, has_lookaround)`
    //           records two facts about the open group's body so
    //           far: "did we see a `|` at this depth" and "does
    //           this group's body contain at least one lookaround".
    // Why:      The panicking shape has alt+lookaround inside one
    //           group; we need per-depth tracking because flat
    //           counters would confuse "alternation in group A,
    //           lookaround in group B" with the real pattern
    //           "alternation AND lookaround both in group A".
    // TS map:   `const parenStack: Array<[boolean, boolean]> = [];`.
    let mut paren_stack: Vec<(bool, bool)> = Vec::new();
    // What:     `total_lookarounds: usize` counts every lookaround
    //           opening in the source, regardless of depth or
    //           position. Used in the final-check to know whether
    //           the alt+la group had a sibling lookaround anywhere
    //           else in the source.
    // Why:      The panicking shape always has TWO or more
    //           lookarounds in the source; one alone (even inside
    //           alternation) doesn't trigger.
    // TS map:   `let totalLookarounds = 0;`.
    let mut total_lookarounds: usize = 0;
    // What:     `found_alt_la_group: bool` is set when ANY closed
    //           group's body had both alternation and at least one
    //           lookaround. Sticky -- once set, stays set.
    // Why:      The sibling lookaround may appear AFTER the
    //           alt+la group closes (e.g. `(a|(?![_]))(?!a)`); we
    //           can't fire at close time because we don't yet
    //           know about future siblings. Tracking the flag and
    //           checking at the end of the walk handles both
    //           "sibling before" and "sibling after" symmetrically.
    // TS map:   `let foundAltLaGroup = false;`.
    let mut found_alt_la_group = false;
    while i < bytes.len() {
        let c = bytes[i];
        if c == b'\\' {
            i += 2;
            continue;
        }
        if !in_class && c == b'[' {
            in_class = true;
            i += 1;
            continue;
        }
        if in_class && c == b']' {
            in_class = false;
            i += 1;
            continue;
        }
        if in_class {
            i += 1;
            continue;
        }
        // What:     Alternation `|` outside class. Marks the
        //           innermost open group as containing alternation.
        // Why:      Belongs to the innermost group; need per-depth
        //           tracking.
        if c == b'|' {
            if let Some(top) = paren_stack.last_mut() {
                top.0 = true;
            }
            i += 1;
            continue;
        }
        // What:     Group open `(`. Two cases:
        //           - Lookaround open `(?=`/`(?!`/`(?<=`/`(?<!`:
        //             count it, mark CURRENT top-of-stack as
        //             containing a lookaround, then push a fresh
        //             frame for the lookaround's own group body.
        //           - Other `(...)` (capturing, non-capturing, named,
        //             flags, comment): push a fresh frame.
        // Why:      The lookaround's PARENT group is the one
        //           containing it; the lookaround's own body is
        //           irrelevant for the pattern we're matching.
        if c == b'(' {
            let is_lookaround = i + 2 < bytes.len()
                && bytes[i + 1] == b'?'
                && (matches!(bytes[i + 2], b'=' | b'!')
                    || (bytes[i + 2] == b'<'
                        && i + 3 < bytes.len()
                        && matches!(bytes[i + 3], b'=' | b'!')));
            if is_lookaround {
                total_lookarounds += 1;
                if let Some(top) = paren_stack.last_mut() {
                    top.1 = true;
                }
            }
            paren_stack.push((false, false));
            i += 1;
            continue;
        }
        // What:     Group close `)`. Pop the top frame. If the
        //           popped frame had BOTH alternation AND at
        //           least one lookaround in its body, set the
        //           sticky `found_alt_la_group` flag. Also bubble
        //           the popped frame's has_lookaround up to the
        //           parent (a group contains a lookaround if any
        //           nested group did).
        // Why:      We defer the final fire-decision to end of
        //           walk because the sibling lookaround may appear
        //           AFTER the alt+la group closes. The bubble
        //           preserves the per-depth invariant: an outer
        //           group's body has a lookaround iff a nested
        //           subgroup body did.
        if c == b')' {
            let popped = paren_stack.pop().unwrap_or((false, false));
            if popped.0 && popped.1 {
                found_alt_la_group = true;
            }
            if popped.1
                && let Some(parent) = paren_stack.last_mut() {
                    parent.1 = true;
                }
            i += 1;
            continue;
        }
        i += 1;
    }
    // What:     Final check: fire when any closed group had both
    //           alternation AND a lookaround in its body. The
    //           `total_lookarounds` counter is retained for debugging
    //           but is no longer required to gate the fire decision.
    // Why:      The original detector required `total_lookarounds >= 2`
    //           on the theory that the shape always has a sibling
    //           lookaround. Bisection of
    //           `crash-c3c364eb3a03114a52015721c02cba0bf20eb496` (rendered
    //           as `(?:        4qüVk|o\w|\s(?![_]))23o:aaaaaaaaaaaaaaa`)
    //           showed that a SINGLE lookaround inside an alternation
    //           followed by literal content can also trip
    //           `engine.rs:1020` at find-all time in resharp 0.5.x to
    //           0.6.0 (fixed upstream in 0.6.x; this guard is now
    //           belt-and-suspenders). The
    //           threshold of 2 was an over-narrow heuristic from the
    //           first crash shape; widening to "any alt+la group"
    //           accepts a small false-positive rate in exchange for
    //           defense against the broader panic class.
    let _ = total_lookarounds;
    if found_alt_la_group {
        return Some(format!(
            "alternation containing a lookaround triggered a resharp 0.5.x to 0.6.0 debug_assert in the forward scan (`engine.rs:1020`; `unexpected end 0 > N`), now fixed upstream in 0.6.x; this rule is rejected conservatively (belt-and-suspenders), not to avoid a live crash. Minimal reproducers: `(a|(?![_]))(?!a)` and `(?:literal|other|x(?![_]))trailing`. Rewrite to remove the alternation, lift the lookaround outside, replace with an explicit byte consume, or split the rule into two separate patterns. {}",
            TROUBLESHOOT_REF
        ));
    }
    None
}

// What:     `pub fn intersection_with_word_end_alternation(src: &str) -> Option<String>`
//           detects rule shapes that match resharp 0.5.x through 0.6.x's
//           algebra arithmetic-overflow panic at
//           `resharp-algebra/src/lib.rs:2479`
//           (`attempt to add with overflow` inside
//           `attempt_rw_concat_2`; the `overflow-checks = true`
//           profile setting in our Cargo.toml is load-bearing for
//           this panic to fire in release). The minimum bisected
//           reproducer is `(?:\w|$)(?:(?![1g]\_X)& a)`: an
//           alternation containing both `\w` and the end-anchor
//           `$` concatenated with an intersection whose operand
//           contains a negative lookahead enclosing a character
//           class followed by additional literal bytes. The
//           overflow happens during DFA derivative construction,
//           reached from `Regex::new`.
// Why:      Bisection (see docs/troubleshooting/resharp.md) showed
//           the trigger is robust to the specific lookahead
//           class contents and the surrounding scoped-flag wrap.
//           The cheapest stable signal is "intersection (`&`
//           outside class) co-occurring with both `\w`
//           shorthand and `$` end-anchor (outside class) in the
//           same rule source". Real secret-detection rules
//           rarely combine all three -- they are either pure
//           literal-prefix patterns or simple character classes
//           -- so the false-positive rate is low. The catch_unwind
//           wrap in `compile_rule_src` is the load-bearing
//           safety net; this pre-validator turns the panic into
//           an actionable message for the common shape.
// TS map:   `function intersectionWithWordEndAlternation(src: string): string | null`.
//
// In TS you'd write (pseudocode):
// ```ts
// function intersectionWithWordEndAlternation(src: string): string | null {
//   // walk bytes outside character classes; flag `&` outside class,
//   // flag `\w`, flag `$`. Return reason when all three are present.
// }
// ```
pub fn intersection_with_word_end_alternation(src: &str) -> Option<String> {
    // What:     Single linear walk that records three independent
    //           signals outside character classes: a bare `&`
    //           (intersection), a `\w` / `\W` shorthand, and a `$`
    //           end-anchor. The verdict is taken once, after the
    //           walk.
    // Why:      The signals can arrive in any order. Deciding at
    //           the end guarantees a `\w` that shows up last (for
    //           example `a&$\w`, or `\w` followed only by a class
    //           or further escapes) is still reported; checking
    //           only while visiting plain bytes would miss it
    //           because escapes are stepped over two bytes at a
    //           time.
    // TS map:   One loop with three booleans, then a final `if`.
    let bytes = src.as_bytes();
    let mut i = 0usize;
    let mut in_class = false;
    let mut has_intersection = false;
    let mut has_word_shorthand = false;
    let mut has_end_anchor = false;

    while let Some(&c) = bytes.get(i) {
        match c {
            // What:     `\X` escape with a following byte. Note
            //           `\w` / `\W` when outside a class, then
            //           step over both bytes.
            // Why:      Inside a class the same escape is part of
            //           a byte-set definition, which is not the
            //           shape that was bisected to the panic. The
            //           escaped byte itself (`\&`, `\$`, `\[`,
            //           `\]`) is a literal and must not be read
            //           as structure. A lone trailing `\` has no
            //           following byte and falls through as an
            //           ordinary byte.
            // TS map:   `if (b[i+1] === 'w' || b[i+1] === 'W')`.
            b'\\' if i + 1 < bytes.len() => {
                if !in_class && matches!(bytes[i + 1], b'w' | b'W') {
                    has_word_shorthand = true;
                }
                i += 2;
                continue;
            }
            b'[' if !in_class => in_class = true,
            b']' if in_class => in_class = false,
            b'&' if !in_class => has_intersection = true,
            b'$' if !in_class => has_end_anchor = true,
            _ => {}
        }
        i += 1;
    }

    if !(has_intersection && has_word_shorthand && has_end_anchor) {
        return None;
    }
    Some(format!(
        "intersection (`&`) co-occurring with `\\w` shorthand and `$` end-anchor triggers a known resharp 0.5.x through 0.6.x arithmetic-overflow panic in `attempt_rw_concat_2` (`resharp-algebra/src/lib.rs:2479`). Rewrite the rule to avoid this combination -- typically by replacing `\\w` with an explicit character class (`[A-Za-z0-9_]`) or by lifting the end-anchor outside the intersection. {}",
        TROUBLESHOOT_REF
    ))
}