forbidden-strings 0.1.2

Out-of-band scanner for forbidden literal strings and regex patterns. Gitignore-aware, fast, dependency-light: built for CI deny-listing of leaked credentials and banned tokens.
// What:     `use memchr::memchr_iter;` imports a SIMD-accelerated
//           "find every occurrence of byte B in slice S" iterator.
//           memchr is the foundation that aho-corasick is also built
//           on, so this dep is essentially free in our build.
// Why:      `build_line_index` walks every byte of one file and
//           records the offset of each `\n`. memchr_iter does that
//           with AVX2/NEON (when available) instead of byte-at-a-time
//           scalar code, so a 1M-line file builds the index in
//           milliseconds instead of tens of milliseconds.
// TS map:   No 1:1 equivalent. Closest is `String.prototype.matchAll`
//           with a `/\n/g` regex, but that is slower than SIMD memchr.
//
// In TS you'd write (pseudocode):
// ```ts
// // No equivalent. Imagine:
// // for (const m of content.matchAll(/\n/g)) starts.push(m.index + 1);
// ```
use memchr::memchr_iter;

// What:     `pub fn is_likely_binary(content: &[u8]) -> bool` takes a
//           borrowed byte slice and returns `true` if a NUL byte appears
//           in the first 8KB.
// Why:      Plain-text source code never contains NUL; binaries
//           commonly do near the start. Skipping these files avoids
//           regex cost on content with no meaningful "line:col" output
//           and prevents accidentally-tracked blobs (qcow2 fragments,
//           bundled images, lockfile-with-blob) from blowing up scan
//           time.
// TS map:   `function isLikelyBinary(content: Uint8Array): boolean`.
//
// In TS you'd write (pseudocode):
// ```ts
// function isLikelyBinary(content: Uint8Array): boolean {
//   const probeLen = Math.min(content.length, 8192);
//   for (let i = 0; i < probeLen; i++) if (content[i] === 0) return true;
//   return false;
// }
// ```
pub fn is_likely_binary(content: &[u8]) -> bool {
    let probe_len = content.len().min(8192);
    content[..probe_len].contains(&0u8)
}

// What:     `pub fn build_line_index(content: &[u8]) -> Vec<usize>`
//           produces a sorted `Vec<usize>` of byte offsets where each
//           line starts. The first entry is always `0` (line 1's start);
//           subsequent entries are the offset of the byte JUST AFTER
//           each `\n`. So a file `"abc\ndef"` yields `[0, 4]` --
//           line 1 begins at 0, line 2 begins at 4.
// Why:      Replacing the old per-hit byte walk with an O(n)-once index
//           plus O(log L) lookups (L = line count). The win matters
//           when a single file has many hits -- e.g. an agent that
//           wrote a forbidden literal a million times: 2M walks of
//           average length n/2 collapse to one O(n) build plus 2M
//           binary searches. Building only happens lazily on the
//           first hit, so 99%-clean files never pay this cost.
// TS map:   `function buildLineIndex(content: Uint8Array): number[]`.
// Gotcha:   The returned vec's length is `1 + count(\\n in content)`,
//           NOT the visible line count when the file ends without a
//           trailing newline. The last entry can equal `content.len()`
//           when the file ends with `\n`; lookups must tolerate that.
//
// In TS you'd write (pseudocode):
// ```ts
// function buildLineIndex(content: Uint8Array): number[] {
//   const starts = [0];
//   for (let i = 0; i < content.length; i++) {
//     if (content[i] === 0x0a) starts.push(i + 1);
//   }
//   return starts;
// }
// ```
pub fn build_line_index(content: &[u8]) -> Vec<usize> {
    // What:     `Vec::with_capacity(n)` pre-allocates n slots so push
    //           does not have to grow the buffer for the first n
    //           entries. We estimate n from average line length ~32.
    // Why:      Avoid quadratic copy cost on grow for very long files.
    // TS map:   No equivalent; JS arrays auto-grow with amortised O(1).
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // const starts: number[] = [];
    // ```
    let mut starts: Vec<usize> = Vec::with_capacity(content.len() / 32 + 1);
    starts.push(0);
    // What:     `memchr_iter(b'\n', content)` returns a SIMD-accelerated
    //           iterator over every byte position of `\n` in `content`.
    //           `b'\n'` is a byte literal (`u8` value 10).
    // Why:      Hot loop; SIMD beats scalar by 4-8x on long inputs.
    // TS map:   No 1:1; mentally `[...content].flatMap((b, i) => b===10 ? [i] : [])`.
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // for (let i = 0; i < content.length; i++) {
    //   if (content[i] === 0x0a) starts.push(i + 1);
    // }
    // ```
    for nl in memchr_iter(b'\n', content) {
        starts.push(nl + 1);
    }
    starts
}

// What:     `pub fn line_and_col_indexed(line_starts: &[usize], offset: usize) -> (usize, usize)`
//           is the indexed replacement for the old `line_and_col`. It
//           does an O(log L) binary search instead of an O(offset)
//           walk to find which line owns `offset`.
// Why:      Same `(line, col)` output as before; faster when called
//           many times on one file because the index is shared.
// TS map:   `function lineAndColIndexed(lineStarts: number[], offset: number): [number, number]`.
//
// In TS you'd write (pseudocode):
// ```ts
// function lineAndColIndexed(lineStarts: number[], offset: number): [number, number] {
//   // partition_point: first index whose value is > offset
//   let lo = 0, hi = lineStarts.length;
//   while (lo < hi) {
//     const mid = (lo + hi) >> 1;
//     if (lineStarts[mid] <= offset) lo = mid + 1; else hi = mid;
//   }
//   const lineIdx = Math.max(0, lo - 1);
//   return [lineIdx + 1, offset - lineStarts[lineIdx] + 1];
// }
// ```
pub fn line_and_col_indexed(line_starts: &[usize], offset: usize) -> (usize, usize) {
    // What:     `slice.partition_point(pred)` returns the first index
    //           where `pred` becomes false (assuming the slice is
    //           "false-then-true" partitioned by `pred`). For a sorted
    //           ascending slice and the predicate `|s| s <= offset`,
    //           this gives one past the last index whose value is
    //           `<= offset`. `saturating_sub(1)` is "subtract 1 but
    //           don't underflow `usize`" -- when the predicate is
    //           false at index 0 the result is 0 instead of wrapping.
    // Why:      Find the largest line-start that is <= offset; that
    //           line owns the byte at `offset`.
    // TS map:   See pseudocode above; TS has no `partition_point`,
    //           hand-rolled binary search needed.
    //
    // In TS you'd write (pseudocode):
    // ```ts
    // const lineIdx = Math.max(0, (lo) - 1);
    // ```
    let line_idx = line_starts
        .partition_point(|&s| s <= offset)
        .saturating_sub(1);
    let line = line_idx + 1;
    let col = offset - line_starts[line_idx] + 1;
    (line, col)
}

// What:     `pub fn end_in_line_indexed(line_starts: &[usize], start: usize, end: usize) -> usize`
//           returns the byte offset of the first `\n` in `[start, end)`
//           if one exists, else returns `end` unchanged. Indexed
//           replacement for the old `end_in_line`.
// Why:      Same semantics as before -- clamping multi-line matches
//           to one line for the report. Now O(log L) instead of
//           O(end - start).
// TS map:   `function endInLineIndexed(lineStarts: number[], start: number, end: number): number`.
//
// In TS you'd write (pseudocode):
// ```ts
// function endInLineIndexed(lineStarts: number[], start: number, end: number): number {
//   const lineIdx = Math.max(0, partitionPoint(lineStarts, s => s <= start) - 1);
//   if (lineIdx + 1 < lineStarts.length) {
//     const nextLineStart = lineStarts[lineIdx + 1];
//     if (nextLineStart > 0 && nextLineStart - 1 < end) return nextLineStart - 1;
//   }
//   return end;
// }
// ```
pub fn end_in_line_indexed(line_starts: &[usize], start: usize, end: usize) -> usize {
    let line_idx = line_starts
        .partition_point(|&s| s <= start)
        .saturating_sub(1);
    if line_idx + 1 < line_starts.len() {
        let next_line_start = line_starts[line_idx + 1];
        if next_line_start > 0 && next_line_start - 1 < end {
            return next_line_start - 1;
        }
    }
    end
}

// What:     `pub fn format_hit(path, line, col_start, col_end, rule_idx) -> String`
//           builds the redacted `path:line:col_start..col_end rule=N`
//           output string. Public so `scan.rs` can call it.
// Why:      Output format must NEVER include the matched substring --
//           the failing CI log itself is a leak surface. Centralizing
//           the format string here ensures every hit is redacted the
//           same way.
// TS map:   `function formatHit(path: string, line: number, colStart: number, colEnd: number, ruleIdx: number): string`.
//
// In TS you'd write (pseudocode):
// ```ts
// function formatHit(path, line, colStart, colEnd, ruleIdx) {
//   return `${path}:${line}:${colStart}..${colEnd} rule=${ruleIdx}`;
// }
// ```
pub fn format_hit(
    path: &str,
    line: usize,
    col_start: usize,
    col_end: usize,
    rule_idx: usize,
) -> String {
    format!("{}:{}:{}..{} rule={}", path, line, col_start, col_end, rule_idx)
}