forbidden-strings 0.1.9

Out-of-band scanner for forbidden literal strings and regex patterns. Gitignore-aware, fast, dependency-light: built for CI deny-listing of leaked credentials and banned tokens.
Documentation
// What:     `pub fn nested_quantifier_after_wildcard(src: &str) -> Option<String>`
//           detects rule shapes where a bare `_` wildcard (the
//           scanner's `_` triad atom, expanded to wildcard during
//           resharp dispatch) is immediately followed by three or
//           more consecutive `)`+quantifier adjacencies. The shape
//           the fuzz target emits is `(?:(?:(?:(?:_){5,6}){5,12})+`
//           and similar, decoded from artifacts
//           `slow-unit-8c4172d7d381b5c64c5aba568217c38c5ce94945`
//           (compile 409ms + scan 1.16s) and
//           `slow-unit-709cb39b5255ddf0721c435159191d03aa0498ea`
//           (compile 4.33s).
// Why:      Each `){N,M}` on a group multiplies the inner atom's NFA
//           state count by up to M. `_` is the largest atom in the
//           dispatch surface (matches every byte / Unicode code
//           point), so depth-3 nesting on `_` produces hundreds of
//           thousands of effective inner repetitions and resharp's
//           NFA construction takes from hundreds of milliseconds to
//           several seconds. libFuzzer's `report_slow_units` flag
//           records these as slow-unit artifacts; the fuzz target
//           burns budget re-running them. The existing
//           `nested_grouped_quantifier` detector (threshold 4)
//           passes these depth-3 shapes through to resharp because
//           depth-3 with literal innermost atom (e.g. `(?:(?:(?:a)*)*)*`)
//           does compile in milliseconds. Distinguishing on the
//           innermost atom kind without rewriting the existing
//           walker is cleanest as a separate, targeted pre-validator
//           that fires only when the chain follows `_` directly.
//
//           Detection criterion (single-pass byte walker):
//             - Walk `src`, skipping escape pairs `\X` and class
//               interiors `[...]` (the `_` inside a class is a
//               literal underscore byte, not the wildcard triad).
//             - At every bare `_` byte outside a class, count the
//               immediately-following chain of `)`+quantifier pairs.
//             - If the chain length reaches 3, return Some(reason).
//
//           Threshold: chain >= 3 (one less than
//           `nested_grouped_quantifier`'s threshold of 4). Empirical
//           timings (probed against resharp 0.6.0 with
//           overflow-checks=on): depth-3 on `_` ranges 400ms-4.3s;
//           depth-2 on `_` (chain 2) compiles in <50ms. Catching at
//           chain=3 is the precise inflection point and keeps the
//           false-positive risk at zero against the production
//           ruleset (max chain in `forbidden-strings.local.txt` is
//           1; max chain in any production rule is 1).
//
//           Place the call in `compile_rule_src`'s resharp branch:
//           bare `_` outside a class is a `requires_resharp` trigger,
//           so the validator only fires for rules that would route
//           to resharp anyway. The plain-regex path never sees the
//           `_` wildcard triad and the validator would be dead code
//           on that branch.
// TS map:   `function nestedQuantifierAfterWildcard(src: string): string | null`.
//
// In TS you'd write (pseudocode):
// ```ts
// function nestedQuantifierAfterWildcard(src: string): string | null {
//   // walk bytes; skip \X escapes and [class] bodies;
//   // at every bare `_` outside a class, count consecutive `)`+quant
//   // pairs that follow; if >= 3, return reason.
// }
// ```
pub fn nested_quantifier_after_wildcard(src: &str) -> Option<String> {
    let bytes = src.as_bytes();
    let mut i = 0usize;
    let mut in_class = false;
    while i < bytes.len() {
        let c = bytes[i];
        // What:     Escape pair `\X` is two bytes representing one atom.
        // Why:      `\_` is an escaped underscore literal, not the
        //           wildcard triad; `\(` / `\)` would corrupt the
        //           class-tracking state.
        // TS map:   `if (c === 0x5c) { i += 2; continue; }`.
        if c == b'\\' {
            i += 2;
            continue;
        }
        // What:     Character class entry / exit. Inside `[...]`, `_`
        //           is a literal underscore byte, NOT the wildcard
        //           triad.
        // Why:      `[_]` matches the single byte `_`; only bare `_`
        //           outside a class expands to wildcard in this
        //           scanner's dispatch.
        // TS map:   `if (!inClass && c === 0x5b) { inClass = true; i++; continue; }`.
        if !in_class && c == b'[' {
            in_class = true;
            i += 1;
            continue;
        }
        if in_class {
            if c == b']' {
                in_class = false;
            }
            i += 1;
            continue;
        }
        // What:     At every bare `_` outside a class, count the
        //           immediately-following chain of `)`+quantifier
        //           pairs. If >= 3, return reason.
        // Why:      The 8c41 / 709c slow-unit shape is exactly this:
        //           a `_` followed by three or more close+quantifier
        //           adjacencies.
        // TS map:   `if (c === 0x5f) { const chain = countCloseQuantChain(bytes, i+1); if (chain >= 3) return reason; }`.
        if c == b'_' {
            let chain = count_close_quant_chain_after(bytes, i + 1);
            if chain >= 3 {
                return Some(format!(
                    "nested quantifier on bare wildcard `_` at byte offset {}: chain of {} consecutive `)`+quantifier pairs immediately follows the `_`. The `_` triad expands to a wildcard atom (matches every char); nesting quantifiers depth 3+ on a wildcard explodes resharp's NFA construction, taking hundreds of milliseconds to several seconds to compile. Flatten the nesting or replace `_` with a more restrictive atom.",
                    i, chain
                ));
            }
        }
        i += 1;
    }
    None
}

// What:     `pub fn nested_chain_in_lookaround_body(src: &str) -> Option<String>`
//           detects rule shapes containing a chain of three or more
//           consecutive `)`+quantifier adjacencies anywhere INSIDE
//           an open lookaround body (`(?!...)`, `(?=...)`,
//           `(?<!...)`, `(?<=...)`). Decoded from artifact
//           `slow-unit-4eabfd5c52969dcc20c2170cd30947eccf8ae62f`
//           which compiles in 1.9s before resharp returns
//           `Algebra(UnsupportedPattern)`.
// Why:      Even with a literal innermost atom (`nested_quantifier_after_wildcard`
//           does not fire), resharp's algebra simplifier struggles
//           to canonicalise the derivative of a chain-3 quantifier
//           nest that sits inside an open lookaround context. The
//           lookaround forces the simplifier to walk derivative
//           shapes per-prefix per-suffix, multiplying the work
//           proportionally to the chain's NFA size. The compile
//           wall-clocks past libFuzzer's slow-unit threshold even
//           though the eventual outcome is a clean `Err`.
//
//           Detection criterion (single-pass byte walker with a
//           lookaround-depth stack):
//             - Walk `src`, skipping escape pairs `\X` and class
//               interiors `[...]`.
//             - On `(`, push a frame: `true` if the opener is
//               `(?!` / `(?=` / `(?<!` / `(?<=`, else `false`.
//             - On `)`, pop the frame.
//             - Maintain a running `chain` counter of consecutive
//               `)`+quantifier pairs; reset on `(`, alternation `|`,
//               escape, class entry, or any other literal byte.
//             - Increment chain only when `)` is immediately followed
//               by a quantifier (`*`/`+`/`?`/`{N`). Consume the
//               quantifier and an optional lazy `?`.
//             - If chain >= 3 AND any frame currently on the stack
//               is a lookaround, return Some(reason).
//
//           The lookaround-stack is intentionally counter-style
//           (any lookaround anywhere up the stack triggers) rather
//           than nearest-lookaround: 4eab nests the chain inside
//           the OUTER `(?!...)` lookahead with an INNER `(?!...)`
//           in between; both are open during the chain. A nearest-
//           lookaround check would miss the case where the chain
//           sits two levels deep inside the outer lookaround.
//
//           Place the call in `compile_rule_src`'s resharp branch:
//           lookarounds are `requires_resharp` triggers, so the
//           validator only fires for rules that would route to
//           resharp anyway.
// TS map:   `function nestedChainInLookaroundBody(src: string): string | null`.
//
// In TS you'd write (pseudocode):
// ```ts
// function nestedChainInLookaroundBody(src: string): string | null {
//   // walk bytes; maintain a stack of bool (true=lookaround frame);
//   // skip \X escapes and [class] bodies;
//   // on `(` push frame (true if `(?!`/`(?=`/`(?<!`/`(?<=`);
//   // on `)` pop frame, then if next is quantifier, consume and
//   //   chain++; if chain >= 3 and any open frame is lookaround,
//   //   return reason;
//   // any other byte resets chain.
// }
// ```
pub fn nested_chain_in_lookaround_body(src: &str) -> Option<String> {
    let bytes = src.as_bytes();
    let mut i = 0usize;
    let mut in_class = false;
    let mut chain: usize = 0;
    // What:     `stack: Vec<bool>`. One entry per open group; `true`
    //           if the opener is a lookaround (`(?!`/`(?=`/`(?<!`/
    //           `(?<=`), `false` otherwise (`(`, `(?:`, `(?P<...>`,
    //           `(?i)`, etc.).
    // Why:      Knowing whether ANY currently-open frame is a
    //           lookaround is enough to gate the chain trigger.
    //           Storing the booleans in order lets us pop on `)`
    //           and check the stack with one pass.
    let mut stack: Vec<bool> = Vec::new();
    while i < bytes.len() {
        let c = bytes[i];
        // What:     Escape pair `\X`. Two bytes, one atom; breaks any
        //           pending close+quant chain.
        // Why:      `\)` would mis-pop a frame; `\(` would push a
        //           bogus frame.
        // TS map:   `if (c === 0x5c) { i += 2; chain = 0; continue; }`.
        if c == b'\\' {
            i += 2;
            chain = 0;
            continue;
        }
        // What:     Character class entry / exit. Inside `[...]`,
        //           parens / quantifiers / `|` are literal bytes.
        // Why:      `[)]` shouldn't pop a frame; `[*]` shouldn't be
        //           read as a quantifier.
        // TS map:   `if (!inClass && c === 0x5b) { inClass = true; chain = 0; i++; continue; }`.
        if !in_class && c == b'[' {
            in_class = true;
            chain = 0;
            i += 1;
            continue;
        }
        if in_class {
            if c == b']' {
                in_class = false;
            }
            i += 1;
            continue;
        }
        // What:     `(`: detect lookaround opener, push frame, reset
        //           chain. Skip the `?` after `(` so the body walk
        //           doesn't misread the group prefix.
        // Why:      Lookaround openers are `(?!` / `(?=` / `(?<!` /
        //           `(?<=`. Detecting at open time means the inner
        //           body walk can check `stack.iter().any(|&b| b)`
        //           in O(depth) without re-parsing.
        // TS map:   `if (c === 0x28) { stack.push(isLookaroundOpen(bytes, i)); chain = 0; i++; if (bytes[i] === 0x3f) i++; continue; }`.
        if c == b'(' {
            let is_lookaround = is_lookaround_opener(bytes, i);
            stack.push(is_lookaround);
            chain = 0;
            i += 1;
            if i < bytes.len() && bytes[i] == b'?' {
                i += 1;
            }
            continue;
        }
        // What:     `)`: pop the frame. Check the following byte for
        //           a quantifier; if present, consume it (plus lazy
        //           `?`) and increment chain. If chain >= 3 AND any
        //           STILL-OPEN frame is a lookaround, return reason.
        // Why:      Popping before the chain check is critical: the
        //           lookaround that wraps the chain must remain on
        //           the stack at trigger time. The wrap is OUTER, so
        //           it stays open until its own matching `)`.
        // TS map:   `if (c === 0x29) { stack.pop(); ...quant consume...; chain++; if (chain >= 3 && stack.some(b => b)) return reason; }`.
        if c == b')' {
            stack.pop();
            i += 1;
            let is_quant = i < bytes.len()
                && (matches!(bytes[i], b'*' | b'+' | b'?')
                    || (bytes[i] == b'{'
                        && i + 1 < bytes.len()
                        && bytes[i + 1].is_ascii_digit()));
            if is_quant {
                if bytes[i] == b'{' {
                    while i < bytes.len() && bytes[i] != b'}' {
                        i += 1;
                    }
                    if i < bytes.len() {
                        i += 1;
                    }
                } else {
                    i += 1;
                }
                if i < bytes.len() && bytes[i] == b'?' {
                    i += 1;
                }
                chain += 1;
                if chain >= 3 && stack.iter().any(|&b| b) {
                    return Some(format!(
                        "nested grouped quantifier inside lookaround body at byte offset {}: chain of {} consecutive `)`+quantifier pairs while an open lookaround frame remains higher up the stack. Resharp's algebra simplifier walks derivative shapes per-prefix per-suffix inside lookarounds, multiplying the chain's NFA cost by the lookaround context size; the compile wall-clocks past libFuzzer's slow-unit threshold even when the eventual outcome is `Err(UnsupportedPattern)`. Flatten the nesting or move the chain outside the lookaround body.",
                        i, chain
                    ));
                }
            } else {
                chain = 0;
            }
            continue;
        }
        // What:     Any other byte (atom, anchor, `|`, literal) breaks
        //           the chain.
        // Why:      Chain requires BACK-TO-BACK close+quant.
        chain = 0;
        i += 1;
    }
    None
}

// What:     `fn count_close_quant_chain_after(bytes: &[u8], start: usize) -> usize`
//           returns the number of consecutive `)`+quantifier pairs
//           starting at `start`. Used by `nested_quantifier_after_wildcard`
//           to count the chain after a `_`.
// Why:      Sharing the chain-counting logic via a helper avoids
//           duplicating the quantifier-parsing rules across the two
//           detectors.
// TS map:   `function countCloseQuantChainAfter(bytes: Uint8Array, start: number): number`.
fn count_close_quant_chain_after(bytes: &[u8], start: usize) -> usize {
    let mut i = start;
    let mut chain = 0usize;
    while i < bytes.len() && bytes[i] == b')' {
        i += 1;
        let is_quant = i < bytes.len()
            && (matches!(bytes[i], b'*' | b'+' | b'?')
                || (bytes[i] == b'{'
                    && i + 1 < bytes.len()
                    && bytes[i + 1].is_ascii_digit()));
        if !is_quant {
            break;
        }
        if bytes[i] == b'{' {
            while i < bytes.len() && bytes[i] != b'}' {
                i += 1;
            }
            if i < bytes.len() {
                i += 1;
            }
        } else {
            i += 1;
        }
        if i < bytes.len() && bytes[i] == b'?' {
            i += 1;
        }
        chain += 1;
    }
    chain
}

// What:     `fn is_lookaround_opener(bytes: &[u8], i: usize) -> bool`
//           returns true if `bytes[i..]` starts with `(?!`/`(?=`/
//           `(?<!`/`(?<=`.
// Why:      Detect lookaround openers without committing to a full
//           regex parser. Used by `nested_chain_in_lookaround_body`.
// TS map:   `function isLookaroundOpener(bytes: Uint8Array, i: number): boolean`.
fn is_lookaround_opener(bytes: &[u8], i: usize) -> bool {
    if bytes.get(i).copied() != Some(b'(') {
        return false;
    }
    if bytes.get(i + 1).copied() != Some(b'?') {
        return false;
    }
    match bytes.get(i + 2).copied() {
        Some(b'!') | Some(b'=') => true,
        Some(b'<') => matches!(bytes.get(i + 3).copied(), Some(b'!') | Some(b'=')),
        _ => false,
    }
}