forbidden-strings 0.1.9

Out-of-band scanner for forbidden literal strings and regex patterns. Gitignore-aware, fast, dependency-light: built for CI deny-listing of leaked credentials and banned tokens.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
// What:     `use super::constants::TROUBLESHOOT_REF;` imports the shared
//           troubleshooting-doc suffix from the parent engine module.
// Why:      Every rejection message should point at the same long-form
//           resharp workaround document without duplicating the path.
// TS map:   `import { TROUBLESHOOT_REF } from "./constants";`.
//
// In TS you'd write (pseudocode):
// ```ts
// import { TROUBLESHOOT_REF } from "./constants";
// ```
use super::constants::TROUBLESHOOT_REF;

// What:     `pub fn lookaround_in_complement(src: &str) -> Option<String>`
//           returns `Some(reason)` when `src` contains a `~(...)` whose
//           body holds an atom that resharp 0.5.x through 0.6.x cannot handle, and
//           `None` otherwise. The detected atoms are:
//             - `\b` (rewritten to a lookaround pair, then refused by
//               the reverse pass at `resharp-algebra/src/lib.rs:2234`)
//             - `\B` (parser falls through to the generic assertion
//               handler at `resharp-parser/src/lib.rs:1419-1424` and
//               rejects at parse time)
//             - unescaped `^` or `$` (rewritten to lookaround in
//               default-multiline mode at
//               `resharp-parser/src/lib.rs:1425-1441`, then refused)
//             - user-explicit lookaround `(?=`, `(?!`, `(?<=`, `(?<!`
//               (refused by the same reverse-pass arm)
//           The function tracks paren depth via a stack of "is this
//           open paren a complement-open" flags so we can recognise
//           when the matching close exits the complement. Character
//           class interiors `[...]` are skipped because inside a class
//           those bytes are literal, not the structural metacharacters.
// Why:      Catch the failure shape before the rule reaches
//           `resharp::Regex::new`, so the user gets an actionable
//           message that names the surface trigger ("complement body
//           contains \b") instead of resharp's opaque rendering
//           ("unsupported lookaround pattern" or
//           "UnsupportedResharpRegex"), which the user must reverse-
//           engineer back to their own input.
// TS map:   `function lookaroundInComplement(src: string): string | null`.
//
// In TS you'd write (pseudocode):
// ```ts
// function lookaroundInComplement(src: string): string | null {
//   // walk bytes; for each position, track:
//   //   inClass: are we inside `[...]`?
//   //   parenStack: bool[] -- true means the open paren was `~(`
//   // inside a complement (any `true` in the stack) and outside a class,
//   // reject \b, \B, ^, $, (?=, (?!, (?<=, (?<!.
// }
// ```
pub fn lookaround_in_complement(src: &str) -> Option<String> {
    let bytes = src.as_bytes();
    let mut i = 0usize;
    let mut in_class = false;
    // What:     `let mut paren_stack: Vec<bool> = Vec::new();`. A growable
    //           vector of `bool`. Each entry records the kind of an
    //           unclosed open paren -- `true` if the opener was `~(`,
    //           `false` for any other `(` (including non-capturing
    //           `(?:`, named `(?P<...>`, inline flags `(?i)`). On `)`
    //           we pop the top; tracking complement-ness depth-aware
    //           lets nested constructs like `~(.*(?:foo).*)` correctly
    //           identify the `~(` as the complement while the inner
    //           `(?:foo)` close does not exit the complement.
    // Why:      Without per-open kind tracking we cannot tell whether
    //           a `)` closes a complement or a regular group, so we
    //           cannot bound the complement body.
    // TS map:   `const parenStack: boolean[] = [];`.
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // const parenStack: boolean[] = [];
    // ```
    let mut paren_stack: Vec<bool> = Vec::new();
    while i < bytes.len() {
        let c = bytes[i];
        // What:     `if c == b'\\' { ... }` handles regex escape
        //           sequences. The trigger atoms `\b` and `\B` ARE
        //           escape sequences, so we check the escapee byte
        //           BEFORE skipping past the pair. Outside a complement
        //           or inside a class, `\b` is literal-ish and we just
        //           skip the two bytes.
        // Why:      The trigger is the escape sequence itself, not the
        //           backslash. Treating `\\` as "skip 2" would let us
        //           miss `\b` and `\B` entirely.
        // TS map:   `if (c === 0x5c) { ... }` (0x5c = `\`).
        //
        // In TS you'd write (pseudocode):
        // ```ts
        // if (c === '\\'.charCodeAt(0)) {
        //   if (inComplement && !inClass && i + 1 < bytes.length) {
        //     const e = bytes[i + 1];
        //     if (e === 'b') return msgWordBoundary();
        //     if (e === 'B') return msgNotWordBoundary();
        //   }
        //   i += 2; continue;
        // }
        // ```
        if c == b'\\' {
            let in_complement = !in_class && paren_stack.iter().any(|&k| k);
            if in_complement && i + 1 < bytes.len() {
                match bytes[i + 1] {
                    b'b' => {
                        return Some(format!(
                            "complement body contains \\b; resharp 0.5.x through 0.6.x rewrites it to an internal lookaround which the reverse pass refuses. Replace with \\W (consumes a char on each side) or literal whitespace, or move the boundary check outside the complement. {}",
                            TROUBLESHOOT_REF
                        ));
                    }
                    b'B' => {
                        return Some(format!(
                            "complement body contains \\B; resharp 0.5.x through 0.6.x rejects it at parse time when its neighbours are unclassifiable. Restructure the rule to avoid \\B inside the complement. {}",
                            TROUBLESHOOT_REF
                        ));
                    }
                    _ => {}
                }
            }
            i += 2;
            continue;
        }
        if !in_class && c == b'[' {
            in_class = true;
            i += 1;
            continue;
        }
        if in_class && c == b']' {
            in_class = false;
            i += 1;
            continue;
        }
        if !in_class {
            // What:     `let in_complement = paren_stack.iter().any(|&k| k);`
            //           returns `true` when ANY entry in the paren
            //           stack is a complement-open. Equivalent to
            //           "we are nested inside at least one `~(`".
            //           `.iter()` borrows the vec; `.any(closure)`
            //           short-circuits on the first match.
            // Why:      A `^` inside a regular group nested inside a
            //           complement (`~(foo(.|\n)*^bar)`) is still
            //           "inside the complement" for resharp's purposes;
            //           the rewrite happens regardless of intermediate
            //           non-complement parens.
            // TS map:   `const inComplement = parenStack.some(Boolean);`.
            //
            // In TS you'd write (pseudocode):
            // ```ts
            // const inComplement = parenStack.some(Boolean);
            // ```
            let in_complement = paren_stack.iter().any(|&k| k);
            if in_complement {
                if c == b'^' {
                    return Some(format!(
                        "complement body contains ^; resharp 0.5.x through 0.6.x rewrites it to a lookbehind in default-multiline mode, which the reverse pass refuses. Use \\A for whole-content start-anchor semantics, or move the anchor outside the complement. {}",
                        TROUBLESHOOT_REF
                    ));
                }
                if c == b'$' {
                    return Some(format!(
                        "complement body contains $; resharp 0.5.x through 0.6.x rewrites it to a lookahead in default-multiline mode, which the reverse pass refuses. Use \\z for whole-content end-anchor semantics, or move the anchor outside the complement. {}",
                        TROUBLESHOOT_REF
                    ));
                }
                if c == b'(' && i + 2 < bytes.len() && bytes[i + 1] == b'?' {
                    let after = bytes[i + 2];
                    if after == b'=' || after == b'!' {
                        return Some(format!(
                            "complement body contains a lookahead (?{}; the reverse pass refuses complement-of-lookaround. Lift the lookaround outside the complement. {}",
                            after as char, TROUBLESHOOT_REF
                        ));
                    }
                    if after == b'<'
                        && i + 3 < bytes.len()
                        && (bytes[i + 3] == b'=' || bytes[i + 3] == b'!')
                    {
                        return Some(format!(
                            "complement body contains a lookbehind (?<{}; the reverse pass refuses complement-of-lookaround. Lift the lookaround outside the complement. {}",
                            bytes[i + 3] as char, TROUBLESHOOT_REF
                        ));
                    }
                }
            }
            // What:     Push/pop the paren stack. Order matters: detect
            //           `~(` BEFORE the bare-`(` arm, otherwise the `~`
            //           and `(` would be pushed independently and we
            //           would miscount.
            // Why:      Maintain accurate complement-depth tracking
            //           across nested groups.
            // TS map:   The same push/pop pattern in JS.
            //
            // In TS you'd write (pseudocode):
            // ```ts
            // if (c === '~' && bytes[i+1] === '(') { parenStack.push(true); i += 2; continue; }
            // if (c === '(') { parenStack.push(false); i += 1; continue; }
            // if (c === ')') { parenStack.pop(); i += 1; continue; }
            // ```
            if c == b'~' && i + 1 < bytes.len() && bytes[i + 1] == b'(' {
                paren_stack.push(true);
                i += 2;
                continue;
            }
            if c == b'(' {
                paren_stack.push(false);
                i += 1;
                continue;
            }
            if c == b')' {
                paren_stack.pop();
                i += 1;
                continue;
            }
        }
        i += 1;
    }
    None
}

// What:     `pub fn complement_intersection_quantified_group(src: &str) -> Option<String>`
//           detects rule shapes that cause resharp's algebra
//           simplification to hang for tens of seconds or
//           indefinitely. The name is historical -- the original
//           bisect targeted `<prefix>~(\w)&(?:aaa)*`, but a second
//           fuzz timeout
//           (`timeout-0815a95346bfa16ae0c6454162d9af0b8c05779c`,
//           shape `(?i) ###(?:\s&üü)(?:####)+#@...`) showed the
//           hang also fires WITHOUT a `~(` complement -- bare
//           intersection (`&`) co-occurring with a quantified
//           group (`)*`/`)+`/etc.) is sufficient. The detector now
//           checks only those two co-occurrence triggers; the
//           complement check is omitted. The original bisect
//           reproducer from
//           `timeout-00179d433e26fbcc3bedf2b7b38b6ce1ff9e6438` was
//           `abc~(\w)&(?:aaa)*`. The compile call to resharp's
//           `Regex::new` does not terminate within libFuzzer's
//           10s per-input timeout.
//
//           Shapes confirmed to hang (>5s each via probe bisection):
//             - `abc~(\w)&(?:aaa)*`
//             - `xyz~(\w)&(?:aaaaaaaaaaaaa)*`
//             - `[_]ñe-XM1[^42v]~(\w)&(?:aaaaaaaaaaaaa)*` (original artifact)
//             - `(?:[^a]~(\w)&(?:aaaaaaaaaaaaa)*)` (with negated-class prefix)
//
//           Shapes that compile in milliseconds:
//             - `~(\w)&(?:a)*` (no prefix)
//             - `abc~(\w)&def` (no quantified group)
//             - `(?:abc~(\w)&(?:aaa)*)` (wrapped in single outer group)
//             - `x~(\w)&(?:a)*` (1-char prefix and 1-char quantified body)
//
//           Root cause traced via gdb-attach to a hung probe plus
//           reading the cloned resharp source: the hang is in
//           `resharp::prefix::calc_prefix_sets_inner` at
//           `resharp-engine/src/prefix.rs:27`. The function walks a
//           chain of regex derivatives in a `loop { ... }` searching
//           for a fixed prefix, but its `redundant` set only ever
//           holds the original `start` node and `NodeId::BOT` --
//           freshly visited nodes are never added back. For the
//           trigger shape `<prefix>~(\w)&(?:aaa)*`, the derivative
//           chain produces a sequence of unique single-target
//           nodes indefinitely (each `der` step rotates the
//           star+intersection state but never lands back on `start`
//           or `BOT`), so the loop runs without bound. The shape
//           `(?:<prefix>~(\w)&(?:aaa)*)` wrapped in a group skips
//           the slow path because the outer non-capturing group
//           changes the simplified node form that enters
//           `select_prefix`, making the derivative chain visit
//           `BOT` early. Documented as Bug E in
//           docs/troubleshooting/resharp.md; prototyped and folded into the
//           merged upstream issue (`resharp-merged-issue.local.md`), filed upstream as ieviev/resharp#5.
//
//           The pre-validator uses a coarse structural heuristic:
//           "source contains a complement (`~(`), intersection
//           (`&`), AND a quantified group (`)*`, `)+`, `)?`,
//           `){...}`) outside character classes". This is
//           conservative: it would flag the OK cases above too.
//           That trade-off is acceptable because:
//             1. The production rule corpus contains zero rules
//                with both `&` and `~(` (intersection requires
//                resharp set-algebra, which is virtually never
//                authored by humans; the only `&` in the example
//                betterleaks config is inside character classes or
//                as `&amp;` HTML-escape literals).
//             2. The fuzz target only authors these shapes
//                accidentally via the structured generator; it
//                never needs them to discover the real bugs.
//             3. The cost of a false positive is "skip this rule
//                and continue"; the cost of a missed hang is
//                "libFuzzer reports a timeout and halts the run",
//                which blocks progress entirely.
// Why:      catch_unwind protects against panics but not against
//           non-termination. resharp does not expose a compile
//           timeout, and we can't safely cancel a running compile
//           from outside. The pre-validator is the only safe way
//           to skip the slow shapes without modifying resharp.
//           Upstream fix would be adding `redundant.insert(node);`
//           on each loop iteration in `calc_prefix_sets_inner`
//           so the visited-set actually accumulates across the walk.
// TS map:   `function complementIntersectionQuantifiedGroup(src: string): string | null`.
//
// In TS you'd write (pseudocode):
// ```ts
// function complementIntersectionQuantifiedGroup(src: string): string | null {
//   // walk bytes outside character classes; set hasComplement on `~(`,
//   // set hasIntersection on `&`, set hasQuantifiedGroup on `)` followed
//   // by `*`/`+`/`?`/`{N`. Return reason when all three are present.
// }
// ```
pub fn complement_intersection_quantified_group(src: &str) -> Option<String> {
    let bytes = src.as_bytes();
    let mut i = 0usize;
    let mut in_class = false;
    let mut has_intersection = false;
    let mut has_quantified_group = false;
    while i < bytes.len() {
        let c = bytes[i];
        if c == b'\\' {
            i += 2;
            continue;
        }
        if !in_class && c == b'[' {
            in_class = true;
            i += 1;
            continue;
        }
        if in_class && c == b']' {
            in_class = false;
            i += 1;
            continue;
        }
        if !in_class {
            if c == b'&' {
                has_intersection = true;
            }
            // What:     `)` followed by `*`/`+`/`?`/`{N` is a
            //           quantified group close. Same recognition
            //           rule used by `nested_grouped_quantifier`.
            // Why:      The trigger shape needs the quantified
            //           group; bare `)` not followed by a
            //           quantifier doesn't reproduce the hang.
            if c == b')' && i + 1 < bytes.len() {
                let next = bytes[i + 1];
                if matches!(next, b'*' | b'+' | b'?')
                    || (next == b'{'
                        && i + 2 < bytes.len()
                        && bytes[i + 2].is_ascii_digit())
                {
                    has_quantified_group = true;
                }
            }
            if has_intersection && has_quantified_group {
                return Some(format!(
                    "intersection (`&`) co-occurring with a quantified group (`(...)`*/+/?/{{N}}) triggers a known resharp 0.5.x through 0.6.x prefix-loop non-termination during `Regex::new` (see docs/troubleshooting/resharp.md Bug E -- `calc_prefix_sets_inner` lacks a visited-set update on each iteration). The compile does not terminate within libFuzzer's per-input timeout. Reproducers: `abc~(\\w)&(?:aaa)*` and `(?i) ###(?:\\s&üü)(?:####)+...`. Rewrite the rule to inline the quantified group's body into the intersection operand, or remove one of the two operators. {}",
                    TROUBLESHOOT_REF
                ));
            }
        }
        i += 1;
    }
    None
}

// What:     `pub fn nested_complement(src: &str) -> Option<String>`
//           detects rule shapes containing one resharp complement
//           `~(...)` whose body contains another `~(...)` complement
//           (back-to-back `~(~(...))` or transparent-group-separated
//           `~((?:~(...)))`). Decoded from artifact
//           `timeout-95f5e661c596e4b5a12e9841cda2e3ba242ecf7a` after
//           the bias commit's generator now produces such shapes.
// Why:      Resharp's algebra simplifier computes a complement via
//           DFA derivative; computing the complement of a complement
//           does not short-circuit to identity in 0.6.x and instead
//           walks both derivative chains. Probed compile time:
//             - `~(~(quantified_ws_chain))` -- 916 ms
//             - `~((?:~(quantified_ws_chain)))` -- 913 ms
//             - `~(quantified_ws_chain)` (single) -- 1.84 ms
//           Under cargo-fuzz's ASAN build the 900 ms compile
//           amplifies past libFuzzer's 10 s per-input timeout (the
//           timeout artifact reproduced in 31 s through the fuzz
//           binary). Source-shape rejection avoids the wall-clock
//           burn.
//
//           Detection criterion (single-pass byte walker with a
//           per-paren is-complement-frame stack):
//             - Walk `src`, skipping escape pairs `\X` and class
//               interiors `[...]`.
//             - On `~(`, FIRST check whether any open frame is a
//               complement; if so, return Some(reason). Then push
//               a "complement" frame onto the stack.
//             - On `(` (without preceding `~`), push a "non-
//               complement" frame; skip the `?` after `(` so the
//               body walk doesn't misread group prefixes.
//             - On `)`, pop the top frame.
//
//           The check-before-push order is critical: it ensures the
//           OUTER complement is still on the stack when the INNER
//           complement is detected. Reversing the order would miss
//           the back-to-back case (the outer's frame would be
//           pushed, then immediately the inner's, with no
//           opportunity to detect that the inner is INSIDE the
//           outer's body).
//
//           Sibling complements (`~(...)&~(...)` shape used by the
//           production rule
//           `RELEASE_TAG_[a-f0-9]{32}&~(RELEASE_TAG_(00){16})&~(...)`)
//           do NOT trigger this detector: the first complement
//           closes (popping its frame) before the second opens, so
//           the second `~(` sees a stack with no complement frames.
//
//           Place the call in `compile_rule_src`'s resharp branch:
//           `~(` is itself a `requires_resharp` trigger, so the
//           validator only fires for rules that would route to
//           resharp anyway.
// TS map:   `function nestedComplement(src: string): string | null`.
//
// In TS you'd write (pseudocode):
// ```ts
// function nestedComplement(src: string): string | null {
//   // walk bytes; maintain stack of bool (true=complement frame);
//   // skip \X escapes and [class] bodies;
//   // on `~(`: check `stack.some(b => b)` first (return if true);
//   //   then push true; skip 2 bytes;
//   // on `(`: push false; skip 1 byte; skip `?` after `(`;
//   // on `)`: pop.
// }
// ```
pub fn nested_complement(src: &str) -> Option<String> {
    let bytes = src.as_bytes();
    let mut i = 0usize;
    let mut in_class = false;
    let mut stack: Vec<bool> = Vec::new();
    while i < bytes.len() {
        let c = bytes[i];
        if c == b'\\' {
            i += 2;
            continue;
        }
        if !in_class && c == b'[' {
            in_class = true;
            i += 1;
            continue;
        }
        if in_class {
            if c == b']' {
                in_class = false;
            }
            i += 1;
            continue;
        }
        // What:     `~(` opens a complement. Check first whether we
        //           are inside another complement; if so, fire.
        //           Otherwise push a "complement" frame and skip
        //           past `~(`.
        // Why:      Resharp's complement-of-complement evaluation
        //           walks both derivative chains and takes
        //           hundreds of milliseconds; under ASAN that
        //           amplifies past libFuzzer's timeout.
        // TS map:   `if (c === 0x7e && bytes[i+1] === 0x28) { if (stack.some(b => b)) return reason; stack.push(true); i += 2; continue; }`.
        if c == b'~' && bytes.get(i + 1).copied() == Some(b'(') {
            if stack.iter().any(|&b| b) {
                return Some(format!(
                    "nested complement `~(~(...))` at byte offset {}: a complement appears inside another complement's body. Resharp's algebra simplifier computes complements via DFA derivative; complement-of-complement walks both derivative chains without identity short-circuit, taking hundreds of milliseconds to compile. Under ASAN the cost amplifies past libFuzzer's 10s per-input timeout. Flatten the nesting (one complement at a time, joined via `&` if combining), or eliminate the outer complement.",
                    i
                ));
            }
            stack.push(true);
            i += 2;
            continue;
        }
        if c == b'(' {
            stack.push(false);
            i += 1;
            if i < bytes.len() && bytes[i] == b'?' {
                i += 1;
            }
            continue;
        }
        if c == b')' {
            stack.pop();
            i += 1;
            continue;
        }
        i += 1;
    }
    None
}