forbidden-strings 0.1.2

Out-of-band scanner for forbidden literal strings and regex patterns. Gitignore-aware, fast, dependency-light: built for CI deny-listing of leaked credentials and banned tokens.
// What:     `use rayon::prelude::*;` brings rayon's parallel-iterator
//           traits into scope. We use it for the per-rule parallel
//           `find_all` pass on the violation slow path.
// Why:      Each regex rule has its own `Mutex<RegexInner>`, so
//           parallelizing across rules (different mutexes) is genuine
//           multi-core work, not contention.
// TS map:   No equivalent.
//
// In TS you'd write (pseudocode):
// ```ts
// // No equivalent.
// ```
use rayon::prelude::*;

// What:     `use std::collections::BTreeSet;` imports an ordered set
//           backed by a balanced binary tree. Insertions and lookups
//           are O(log n). `BTreeSet<usize>` here holds rule positions
//           that need full `find_all` after the AC pass.
// Why:      `BTreeSet` deduplicates rule positions encountered via
//           multiple AC hits AND iterates in sorted order, giving
//           deterministic per-file output ordering.
// TS map:   `new Set<number>()` -- TS sets keep insertion order; the
//           Rust BTreeSet equivalent in TS would sort manually.
//
// In TS you'd write (pseudocode):
// ```ts
// const seenRulePositions = new Set<number>();
// ```
use std::collections::BTreeSet;

// What:     `use std::sync::OnceLock;` brings the thread-safe "set
//           exactly once" cell into scope. `OnceLock<T: Send + Sync>`
//           is itself `Sync`, so a single instance can be shared by
//           reference across rayon worker threads. Concurrent callers
//           of `get_or_init` race only on the first init; the loser's
//           closure is dropped, every caller observes the same `&T`
//           afterward.
// Why:      The line-start index is built only when the first hit
//           fires (most files have zero hits and pay nothing). Once
//           built, it must be visible to AC literal-hit emission,
//           prefix-matched par_iter, and residual-shard par_iter --
//           all on the same file. OnceLock holds the index for the
//           whole `scan_content` call without making any caller pay
//           if no hit ever fires.
// TS map:   No direct equivalent. Closest pattern in TS is a lazy
//           getter that memoises into a closure variable -- TS has
//           no shared-mutable-state-with-races primitive because
//           there are no real threads.
//
// In TS you'd write (pseudocode):
// ```ts
// // Lazy memo; in single-threaded TS no synchronisation needed.
// let lineIndex: number[] | null = null;
// function getLineIndex(): number[] {
//   if (!lineIndex) lineIndex = buildLineIndex(content);
//   return lineIndex;
// }
// ```
use std::sync::OnceLock;

// What:     `use crate::rules::{is_word_byte, AcMeta, RuleSet};` imports
//           the top-level rules container, the per-AC-pattern metadata
//           tag, and the word-character classifier from the sibling
//           `rules.rs` module. `{...}` is a list import.
// Why:      `scan_content` dispatches on `AcMeta` to decide whether an
//           AC hit emits a literal-rule violation directly or queues a
//           regex rule for full evaluation; `is_word_byte` is the
//           file-side half of the conditional word-boundary check
//           (literal-side half is precomputed into `AcMeta::Literal`).
// TS map:   `import { isWordByte, AcMeta, type RuleSet } from "./rules";`.
//
// In TS you'd write (pseudocode):
// ```ts
// import { isWordByte, AcMeta, type RuleSet } from "./rules";
// ```
use crate::rules::{is_word_byte, AcMeta, ResidualShard, RuleSet};
use crate::scan_format::{
    build_line_index, end_in_line_indexed, format_hit, is_likely_binary, line_and_col_indexed,
};

// What:     `pub fn scan_content(path: &str, content: &[u8], rs: &RuleSet) -> Vec<String>`
//           scans one file's contents against the full ruleset and
//           returns an owned `Vec` of redacted hit strings. Empty Vec
//           means clean.
// Why:      Pure function (no side effects, no I/O), one file in -> one
//           Vec out. Pure shape lets callers compose it under any
//           parallel iterator without sharing mutable state.
// TS map:   `function scanContent(path: string, content: Uint8Array, rs: RuleSet): string[]`.
//
// In TS you'd write (pseudocode):
// ```ts
// function scanContent(path: string, content: Uint8Array, rs: RuleSet): string[] {
//   const hits: string[] = [];
//   if (isLikelyBinary(content)) return hits;
//   ...
//   return hits;
// }
// ```
pub fn scan_content(path: &str, content: &[u8], rs: &RuleSet) -> Vec<String> {
    let mut hits: Vec<String> = Vec::new();
    if is_likely_binary(content) {
        return hits;
    }

    // What:     `let mut prefix_matched: BTreeSet<usize> = BTreeSet::new();`
    //           accumulates indices into `rs.regex_rules` whose
    //           required-literal prefix was hit by the unified AC pass.
    //           BTreeSet dedupes (a prefix may appear many times in one
    //           file) and iterates in sorted order.
    // Why:      In the 99%-clean case this set stays empty and no
    //           resharp `find_all` runs. When the AC pass DOES fire a
    //           prefix hit, we run `find_all` exactly once per matching
    //           rule -- not once per AC hit position.
    // TS map:   `const prefixMatched = new Set<number>();`.
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // const prefixMatched = new Set<number>();
    // ```
    let mut prefix_matched: BTreeSet<usize> = BTreeSet::new();

    // What:     `let line_index: OnceLock<Vec<usize>> = OnceLock::new();`
    //           creates an empty thread-safe one-shot cell. Calling
    //           `line_index.get_or_init(closure)` initialises the cell
    //           on first call (running `closure`), and on every later
    //           call returns the stored `&Vec<usize>` without rerunning.
    //           Concurrent racing get_or_init's on multiple rayon
    //           workers all observe the same `&Vec<usize>` afterward.
    // Why:      Build-line-index is O(file size); pay it at most once
    //           per file, only on the first hit, and share across the
    //           AC literal-emit path, the prefix-matched par_iter, and
    //           every residual-shard par_iter.
    // TS map:   No 1:1; closest is a memoised lazy getter.
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // let lineIndex: number[] | null = null;
    // const getLineIndex = () => lineIndex ??= buildLineIndex(content);
    // ```
    let line_index: OnceLock<Vec<usize>> = OnceLock::new();

    // Unified AC: scans for literal rules AND required-literal prefixes
    // of regex rules in a single linear pass. AC's Standard match kind
    // exposes `find_overlapping_iter` so a longer literal at the same
    // position as a shorter regex-prefix doesn't suppress the prefix
    // hit -- without overlapping, a regex rule whose prefix coincides
    // with a literal rule's full text would never trigger.
    if let Some(ac) = &rs.ac {
        // What:     `for m in ac.find_overlapping_iter(content) { ... }`
        //           iterates EVERY (pattern, position) pair in the
        //           content where a pattern matches, regardless of
        //           overlap. `m.pattern().as_usize()` is the AC-internal
        //           id assigned at build time, used here to index
        //           `rs.ac_meta`.
        // Why:      We need both literal-rule emissions AND regex-prefix
        //           queueing to fire from the same scan pass.
        // TS map:   `for (const m of ac.findOverlappingIter(content)) { ... }`.
        //
        // In TS you'd write (pseudocode):
        // ```ts
        // for (const m of ac.findOverlappingIter(content)) {
        //   const meta = rs.acMeta[m.pattern];
        //   if (meta.kind === "literal") {
        //     hits.push(formatHit(path, ..., meta.idx));
        //   } else {
        //     prefixMatched.add(meta.rulePos);
        //   }
        // }
        // ```
        for m in ac.find_overlapping_iter(content) {
            let pid = m.pattern().as_usize();
            match &rs.ac_meta[pid] {
                AcMeta::Literal { idx, bound_left, bound_right } => {
                    // What:     Conditional word-boundary check (mirrors
                    //           `grep -w`). Each side is enforced ONLY
                    //           when the literal's edge byte is itself a
                    //           word character. The check passes when
                    //           the file context on that side is either
                    //           absent (start/end of file) or non-word.
                    //           So a short alpha-only acronym rejects a
                    //           hit when both surrounding chars are
                    //           also word chars, but a path-shaped
                    //           literal like `/etc/passwd` still matches
                    //           inside `cat /etc/passwd` because the
                    //           literal's left edge is `/` (non-word)
                    //           so no left-side boundary is enforced;
                    //           the trailing space/EOF satisfies the
                    //           right-side boundary against the `d`
                    //           edge byte.
                    //           Boundaries are pre-computed at load
                    //           time and ALSO disabled entirely when
                    //           the literal is at least
                    //           `SUBSTRING_THRESHOLD` bytes long --
                    //           long literals are distinctive enough
                    //           that coincidental substring match is
                    //           negligible (math in `rules.rs`).
                    // Why:      The original "any AC hit fires" semantics
                    //           false-positived on coincidental
                    //           substrings inside base64 blobs and
                    //           similar high-entropy noise.
                    // TS map:   `if (boundLeft && start > 0 && isWordByte(content[start - 1])) continue;` etc.
                    //
                    // In TS you'd write (pseudocode):
                    // ```ts
                    // if (boundLeft && m.start > 0 && isWordByte(content[m.start - 1])) continue;
                    // if (boundRight && m.end < content.length && isWordByte(content[m.end])) continue;
                    // ```
                    if *bound_left
                        && m.start() > 0
                        && is_word_byte(content[m.start() - 1])
                    {
                        continue;
                    }
                    if *bound_right
                        && m.end() < content.len()
                        && is_word_byte(content[m.end()])
                    {
                        continue;
                    }
                    let li = line_index.get_or_init(|| build_line_index(content));
                    let (line, col_start) = line_and_col_indexed(li, m.start());
                    let end = end_in_line_indexed(li, m.start(), m.end());
                    let (_, col_end) =
                        line_and_col_indexed(li, if end > 0 { end - 1 } else { 0 });
                    hits.push(format_hit(path, line, col_start, col_end, *idx));
                }
                AcMeta::RegexPrefix { rule_pos } => {
                    prefix_matched.insert(*rule_pos);
                }
            }
        }
    }

    // What:     `if let Some(ac_ci) = &rs.ac_ci { ... }` runs the
    //           parallel case-insensitive AC pass. Each hit is a regex-
    //           rule prefix (literal rules never live here), so we only
    //           queue rule positions; no direct literal emission.
    // Why:      Case-insensitive prefixes from `(?i)`-flagged regex
    //           rules ride this AC. Without it, those rules would fall
    //           into the residual sharded gate and serialize through
    //           a shared `Mutex<RegexInner>` per shard on every file.
    //           See PERF.md: 145 leading-(?i) betterleaks rules
    //           dominate the residual cost on this corpus.
    // TS map:   `for (const m of ac_ci.findOverlappingIter(content)) { ... }`.
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // if (rs.acCi) {
    //   for (const m of rs.acCi.findOverlappingIter(content)) {
    //     const meta = rs.acMetaCi[m.pattern];
    //     prefixMatched.add(meta.rulePos);
    //   }
    // }
    // ```
    if let Some(ac_ci) = &rs.ac_ci {
        for m in ac_ci.find_overlapping_iter(content) {
            let pid = m.pattern().as_usize();
            match &rs.ac_meta_ci[pid] {
                AcMeta::Literal { .. } => {
                    // unreachable: literal rules never enter the ci AC.
                    // Conservative no-op rather than panic.
                }
                AcMeta::RegexPrefix { rule_pos } => {
                    prefix_matched.insert(*rule_pos);
                }
            }
        }
    }

    // For each regex rule whose prefix fired, run its full `find_all`.
    // `prefix_matched` is small (typically 0 in 99% of files; on a
    // matching file, just the few rules whose literal prefix appeared).
    if !prefix_matched.is_empty() {
        // What:     `prefix_matched.iter().copied().collect::<Vec<usize>>()`
        //           materializes the BTreeSet into a Vec so we can
        //           parallelize over it with rayon. `copied()` turns
        //           the iterator of `&usize` into one of `usize`.
        // Why:      `BTreeSet::par_iter` exists but emits `&usize`
        //           which is harder to thread through closures than
        //           owned values; the materialize-and-par_iter pattern
        //           keeps the closure simple.
        // TS map:   `[...prefixMatched]` -- arrays parallelize via
        //           Promise.all.
        //
        // In TS you'd write (pseudocode):
        // ```ts
        // const positions = [...prefixMatched];
        // const regexHits = (await Promise.all(
        //   positions.map((pos) => scanOneRule(rs.regexRules[pos], content, path))
        // )).flat();
        // ```
        let positions: Vec<usize> = prefix_matched.iter().copied().collect();
        let regex_hits: Vec<String> = positions
            .par_iter()
            .flat_map_iter(|&pos| {
                let rr = &rs.regex_rules[pos];
                let mut local: Vec<String> = Vec::new();
                if let Ok(matches) = rr.re.find_all(content) {
                    let li = line_index.get_or_init(|| build_line_index(content));
                    for m in matches {
                        if m.start == m.end {
                            continue;
                        }
                        let (line, col_start) = line_and_col_indexed(li, m.start);
                        let end = end_in_line_indexed(li, m.start, m.end);
                        let (_, col_end) =
                            line_and_col_indexed(li, if end > 0 { end - 1 } else { 0 });
                        local.push(format_hit(path, line, col_start, col_end, rr.idx));
                    }
                }
                local
            })
            .collect();
        hits.extend(regex_hits);
    }

    // Residual bucket: regex rules whose gating substrings could NOT be
    // extracted. Sharded so each shard's combined-alternation Regex
    // stays under resharp's parse/algebra cliff (see
    // `rules.rs::build_residual_shards`). The shard variants:
    //
    // - `Single { rule_pos }`: the rule's own Regex IS the gate -- skip
    //   the redundant gate.is_match and call find_all directly on the
    //   rule's compiled Regex from `regex_rules`. find_all on a clean
    //   file is similar cost to is_match; on a matching file it's the
    //   work we'd do anyway. Net: ~half the per-file scan cost vs the
    //   original "gate.is_match then rule.find_all" pair.
    //
    // - `Combined { gate, positions }`: keep the gate.is_match short-
    //   circuit so a multi-rule shard fans out to find_all only when
    //   the gate fires, saving N-1 is_match probes.
    for shard in &rs.residual_shards {
        match shard {
            ResidualShard::Single { rule_pos } => {
                let rr = &rs.regex_rules[*rule_pos];
                if let Ok(matches) = rr.re.find_all(content) {
                    if !matches.is_empty() {
                        let li = line_index.get_or_init(|| build_line_index(content));
                        for m in matches {
                            if m.start == m.end {
                                continue;
                            }
                            let (line, col_start) = line_and_col_indexed(li, m.start);
                            let end = end_in_line_indexed(li, m.start, m.end);
                            let (_, col_end) = line_and_col_indexed(
                                li,
                                if end > 0 { end - 1 } else { 0 },
                            );
                            hits.push(format_hit(path, line, col_start, col_end, rr.idx));
                        }
                    }
                }
            }
            ResidualShard::Combined { gate, positions } => {
                if gate.is_match(content) {
                    let regex_hits: Vec<String> = positions
                        .par_iter()
                        .flat_map_iter(|&pos| {
                            let rr = &rs.regex_rules[pos];
                            let mut local: Vec<String> = Vec::new();
                            if let Ok(matches) = rr.re.find_all(content) {
                                let li = line_index.get_or_init(|| build_line_index(content));
                                for m in matches {
                                    if m.start == m.end {
                                        continue;
                                    }
                                    let (line, col_start) = line_and_col_indexed(li, m.start);
                                    let end = end_in_line_indexed(li, m.start, m.end);
                                    let (_, col_end) = line_and_col_indexed(
                                        li,
                                        if end > 0 { end - 1 } else { 0 },
                                    );
                                    local.push(format_hit(path, line, col_start, col_end, rr.idx));
                                }
                            }
                            local
                        })
                        .collect();
                    hits.extend(regex_hits);
                }
            }
        }
    }

    hits
}