Skip to main content

forbidden_strings/
lib.rs

1// TODO: deferred perf work. See /home/user/.claude/plans/dapper-coalescing-horizon.md.
2// TODO:   - L2: line-start index for `line_and_col` -- only matters on the
3// TODO:     violation path; revisit if a single file with many hits ever becomes
4// TODO:     a real workload.
5// TODO:   - Z1: serialize the regex-bucket combined DFA. Resharp 0.5 has no
6// TODO:     serialization API; would require swapping that gate to the `regex`
7// TODO:     crate (`regex-automata::dfa::dense::DFA::to_bytes`). Trigger: when
8// TODO:     startup-only time goes back over ~100ms after P1+P2 land.
9
10// What:     `mod walk;` declares a child module whose source lives in
11//           `walk.rs` (sibling to this file). `mod` is Rust's module
12//           system: it does NOT import names; it simply tells the
//           compiler "this file/module exists, compile it". Its items
//           are referenced via `crate::walk::xxx` afterward.
15// Why:      We split the binary into four files so each unit is
16//           focused: `walk.rs` for the working-tree walker that
17//           respects `.gitignore`.
18// TS map:   Closer to a tsconfig file's "include" entry than to an
19//           `import`. The actual `import` happens via the `use` lines
20//           below.
21// Gotcha:   `mod foo;` without a body is NOT an import; it's a
22//           registration. Forgetting to write `mod` for a sibling file
23//           silently excludes it from the build.
24//
25// In TS you'd write (pseudocode):
26// ```ts
27// // No equivalent. Closest: TypeScript automatically picks up files
28// // in `include` paths; Rust requires explicit `mod` declarations.
29// ```
30mod rules;
31mod scan;
32mod scan_format;
33mod walk;
34
35// What:     `#[cfg(feature = "fuzzing")] pub mod fuzz_api;` registers
36//           the curated re-export module ONLY when the `fuzzing`
37//           Cargo feature is on. The bin target leaves the feature
38//           off and never sees this module; fuzz targets (in
39//           `fuzz/Cargo.toml`) enable the feature and use
40//           `forbidden_strings::fuzz_api::*` to import internals.
41// Why:      Keep the production public surface unchanged while
42//           letting fuzz targets reach the internal helpers they
43//           need.
44// TS map:   `if (process.env.FUZZING) { export * as fuzzApi from "./fuzz_api"; }`
45//           in spirit; no clean 1:1 equivalent because TS has no
46//           compile-time feature gates.
47//
48// In TS you'd write (pseudocode):
49// ```ts
50// // No clean equivalent; conditional re-export at build time.
51// ```
52#[cfg(feature = "fuzzing")]
53pub mod fuzz_api;
54
55// What:     `use std::env;` imports the std `env` module so we can
56//           reference `env::args` / `env::var`.
57// Why:      Reading argv and environment variables.
58// TS map:   `import { argv, env } from "node:process";`.
59//
60// In TS you'd write (pseudocode):
61// ```ts
62// import { argv, env } from "node:process";
63// ```
64use std::env;
65
// What:     `std::fs::canonicalize` is referenced via its full path at
//           each call site (building the skip set, the walker skip
//           lookup, and the cwd config-file check); there is no bare
//           `use std::fs;` because the per-file `fs::read` slurp moved
//           into `read_with_binary_check`, which uses `std::fs::File`
//           directly.
71// Why:      Background on file-reading performance choices: `fs::read`
72//           is empirically faster than `mmap`-based access on this
73//           workload (many small files; per-file VMA setup cost
74//           dominates the saved alloc) -- the E2 mmap experiment
75//           regressed wall time by 35% on Mono and 43% on the Linux
76//           kernel. See PERF.md "Mmap experiment (rejected)".
77//           Thread-local scratch buffers were also tried 2026-05-03;
78//           rayon's nested-parallelism work-stealing (scan_content
79//           uses inner par_iter via prefix-matched and combined-
80//           shard fan-out) re-entered the outer flat_map_iter on
81//           the SAME thread while the buffer was borrowed,
82//           triggering a `RefCell already borrowed` panic. The
83//           per-file alloc cost is dwarfed by the unicode-mode
84//           speedup; not worth the re-entrancy hazard.
85// TS map:   N/A (no import line; `std::fs::canonicalize` is fully
86//           qualified at use sites).
87
88// What:     `use std::io::Write;` imports the `Write` TRAIT (interface-
89//           like). Methods declared by a trait are only callable when
90//           the trait is in scope, even when used via macros like
91//           `writeln!`.
92// Why:      We use `writeln!(handle, ...)` to emit hits.
93// TS map:   No 1:1 equivalent; in TS, methods are always callable.
94//
95// In TS you'd write (pseudocode):
96// ```ts
97// // Unnecessary in TS.
98// ```
99use std::io::Write;
100
101// What:     `use rayon::prelude::*;` brings rayon's parallel-iterator
102//           extension methods into scope (`par_iter`, `flat_map_iter`,
103//           etc.).
104// Why:      The two-phase main loop uses `par_iter` for both the
105//           parallel-read phase and the parallel-scan phase.
106// TS map:   No equivalent.
107//
108// In TS you'd write (pseudocode):
109// ```ts
110// // No equivalent.
111// ```
112use rayon::prelude::*;
113
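// What:     `use crate::walk::list_files;` imports the named function
//           from the sibling module so it can be called by its short
//           name here. A plain `use` does not re-export anything (that
//           would be `pub use`) and does not rename it (that would be
//           `use ... as ...`). `crate::` is the absolute root of this
//           crate. The `rules` and `scan` imports next to it work the
//           same way.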
117// Why:      We call `list_files(".")` once when `--all` mode is
118//           selected to enumerate every scannable file.
119// TS map:   `import { listFiles } from "./walk";`.
120//
121// In TS you'd write (pseudocode):
122// ```ts
123// import { listFiles } from "./walk";
124// ```
125use crate::rules::load_ruleset;
126use crate::scan::scan_content;
127use crate::walk::list_files;
128
129// What:     `fn build_skip_set(rules_path: &str) -> HashSet<PathBuf>`
130//           returns the set of CANONICAL absolute paths to skip when
131//           walking the tree in `--all` mode. Pre-fix this logic was a
132//           basename check (`is_skipped_file`) that matched anywhere in
133//           the tree, so an unrelated `sub/forbidden-strings.local.txt`
134//           was silently dropped along with the actual rule file. Path-
135//           anchored matching pins each skip to its specific filesystem
136//           location.
137// Why:      Closes BUG 6 (basename skip applies to arbitrary explicit
138//           args) and BUG 11 (Windows path basename via rsplit('/')) in
139//           one shape change. Path-anchoring removes both failure modes:
140//           the basename collision cannot trigger because we compare
141//           full canonical paths, and the Windows backslash separator
142//           is handled inside `std::fs::canonicalize` / `PathBuf::eq`.
143//
144//           Skip set composition:
145//             - The actual rules file (whatever the user passed via
146//               `--rules` or `FORBIDDEN_STRINGS_RULES`; falls back to
147//               the default `forbidden-strings.local.txt` in cwd).
148//             - Four canonical self-match paths at their expected
149//               locations relative to repo root. Each file contains
150//               literal copies of rule bodies (generated source) or
151//               documented example matches (rules-engine test
152//               fixtures); scanning them in --all mode produces noise.
153//               If running from a different cwd they fail to
154//               canonicalize and are silently dropped from the set;
155//               matching is still correct for the rules file alone.
156//
157//           The caller separately decides WHEN to apply the skip:
158//           explicit positional args are NEVER skipped (the user asked
159//           for them); only walker output in --all mode is filtered.
160// TS map:   `function buildSkipSet(rulesPath: string): Set<string>`.
161//
162// In TS you'd write (pseudocode):
163// ```ts
164// function buildSkipSet(rulesPath: string): Set<string> {
165//   const set = new Set<string>();
166//   try { set.add(fs.realpathSync(rulesPath)); } catch {}
167//   for (const k of CANONICAL_SELF_MATCH_PATHS) {
168//     try { set.add(fs.realpathSync(k)); } catch {}
169//   }
170//   return set;
171// }
172// ```
fn build_skip_set(rules_path: &str) -> std::collections::HashSet<std::path::PathBuf> {
    // What:     Self-match paths, written relative to the directory the
    //           binary is run from (expected to be the repo root). Each
    //           names a file known to contain literal copies of rule
    //           bodies (generated source) or documented example match
    //           strings (rules-engine test fixtures), so scanning it in
    //           `--all` mode would only report self-matches.
    // Why:      Pinning each entry to one location keeps the skip from
    //           firing on an unrelated file that merely shares the
    //           basename somewhere else in the tree.
    // TS map:   `const SELF_MATCH_PATHS = ["...", "...", "...", "..."] as const;`.
    const SELF_MATCH_PATHS: [&str; 4] = [
        "packages/cli/forbidden-strings/data/betterleaks-default-config.toml",
        "packages/cli/forbidden-strings/src/port-betterleaks-relaxations.ts",
        "forbidden-strings.local.example.txt",
        "packages/cli/forbidden-strings/src/rules/algebra_tests.rs",
    ];

    // What:     One pipeline: the rules path first, then the fixed
    //           self-match paths; each candidate is canonicalized and
    //           the successes are collected into the set.
    //           `canonicalize` resolves symlinks and makes the path
    //           absolute, so the same file reached through different
    //           relative spellings ends up as a single entry.
    // Why:      A candidate that fails to canonicalize (missing rules
    //           file, binary run from a different cwd, relocated
    //           fixture) is dropped on purpose via `.ok()`. A missing
    //           rules file is expected to be reported by the ruleset
    //           loader, and a path that does not exist cannot be
    //           produced by the walker either.
    // TS map:   `new Set(candidates.flatMap(p => { try { return [fs.realpathSync(p)]; } catch { return []; } }))`.
    std::iter::once(rules_path)
        .chain(SELF_MATCH_PATHS)
        .filter_map(|candidate| std::fs::canonicalize(candidate).ok())
        .collect()
}
237
238// What:     `fn is_walker_skipped(path: &str, skip_set: &HashSet<PathBuf>) -> bool`
239//           returns true when the path's canonical form matches a
240//           skip-set entry. Used ONLY for walker output in --all mode;
241//           explicit positional args bypass this check entirely.
242// Why:      Closes BUG 6: the previous `is_skipped_file` ran on every
243//           queued path regardless of source, hiding real positive
244//           findings on `sub/forbidden-strings.local.txt`-style explicit
245//           args. The path-anchored form here is consulted only when
246//           the caller knows the path came from the walker.
247// TS map:   `function isWalkerSkipped(path: string, skipSet: Set<string>): boolean`.
248//
249// In TS you'd write (pseudocode):
250// ```ts
251// function isWalkerSkipped(path: string, skipSet: Set<string>): boolean {
252//   try {
253//     const canonical = fs.realpathSync(path);
254//     return skipSet.has(canonical);
255//   } catch { return false; }
256// }
257// ```
fn is_walker_skipped(
    path: &str,
    skip_set: &std::collections::HashSet<std::path::PathBuf>,
) -> bool {
    // What:     Resolve `path` to its canonical form and report whether
    //           that exact path is a member of `skip_set`. When the
    //           path cannot be resolved (broken symlink, file vanished
    //           between walk and check) the answer is `false`: an
    //           unresolvable path is never treated as skipped.
    // Why:      Returning `false` on failure keeps the file in the scan
    //           queue, so the later read of that file is the place
    //           where the underlying I/O error is expected to surface.
    //           Cost is one `canonicalize` call per walked file; the
    //           figure recorded for the ~2700-file walked corpus is a
    //           few ms in total (not re-measured here).
    // TS map:   `try { return skipSet.has(fs.realpathSync(path)); } catch { return false; }`.
    match std::fs::canonicalize(path) {
        Ok(resolved) => skip_set.contains(&resolved),
        Err(_) => false,
    }
}
284
285// What:     `fn is_config_file_at_cwd(path, cwd_canonical) -> bool` returns
286//           true when `path` names a `forbidden-strings.*.txt` file sitting
287//           DIRECTLY in the current working directory. The basename check is
288//           cheap and runs first; only on a name match do we canonicalize to
289//           confirm the file's parent IS the cwd, so a same-named file in a
290//           subdirectory is not matched.
291// Why:      These are the scanner's own ruleset files
292//           (`forbidden-strings.local.txt`, `.local.example.txt`,
293//           `.append.txt`, `.append.local.txt`). Scanning them re-derives the
294//           rule bodies as self-matches, so they are always skipped, in BOTH
295//           `--all` walker mode and explicit-positional-arg mode. Unlike the
296//           `--all`-only `build_skip_set` guard, this one also fires on
297//           explicit args, because CI passes changed files positionally
298//           (`forbidden-strings --rules ... <changed>...`) and an edited
299//           `forbidden-strings.append.txt` would otherwise self-match.
300//
301//           The cwd anchor keeps BUG 6 / BUG 11 closed: a file like
302//           `sub/forbidden-strings.local.txt` (different parent) still scans,
303//           because only a config file at the cwd root is skipped.
304// TS map:   `function isConfigFileAtCwd(path: string, cwdCanonical?: string): boolean`.
fn is_config_file_at_cwd(
    path: &str,
    cwd_canonical: Option<&std::path::Path>,
) -> bool {
    // What:     Cheap gate first: pull the final path component as UTF-8
    //           and require the `forbidden-strings.` prefix plus the
    //           `.txt` suffix. A path with no file name, or a name that
    //           is not valid UTF-8, fails the gate.
    // Why:      Nearly every scanned path is rejected here, before any
    //           filesystem call is made.
    // TS map:   `const base = path.basename(p); if (!(base.startsWith(...) && base.endsWith(".txt"))) return false;`.
    let basename = match std::path::Path::new(path)
        .file_name()
        .and_then(|os_name| os_name.to_str())
    {
        Some(name) => name,
        None => return false,
    };
    if !(basename.starts_with("forbidden-strings.") && basename.ends_with(".txt")) {
        return false;
    }

    // What:     Without a known canonical cwd there is nothing to anchor
    //           against, so nothing is treated as a config file.
    // Why:      Checked before canonicalizing `path` so the `None` case
    //           costs no filesystem access.
    // TS map:   `if (cwdCanonical === undefined) return false;`.
    let cwd = match cwd_canonical {
        Some(dir) => dir,
        None => return false,
    };

    // What:     Resolve `path` and compare its parent directory with the
    //           canonical cwd. An unresolvable path is not a match.
    // Why:      Comparing the canonical parent is what keeps a
    //           same-named file in a subdirectory (for example
    //           `sub/forbidden-strings.local.txt`) in the scan.
    // TS map:   `try { return path.dirname(fs.realpathSync(p)) === cwdCanonical; } catch { return false; }`.
    match std::fs::canonicalize(path) {
        Ok(resolved) => resolved.parent() == Some(cwd),
        Err(_) => false,
    }
}
326
327// What:     `BIN_PROBE_SIZE` is the byte length read up-front from every
328//           file before deciding whether the file is binary. 8 KiB is
329//           the same probe size the pre-BUG-5 `is_likely_binary`
330//           heuristic used; it matches `git diff`'s "binary or text"
331//           heuristic threshold.
// Why:      The probe length tunes a tradeoff: a smaller probe lets a
//           binary file with a leading text header (PDF header,
//           Mach-O header) sneak past as text; a larger probe wastes
//           memory on small files. 8 KiB catches the common cases
//           (PNG, JPG, ELF, WASM, zip, ZSTD frames) and is the
//           established convention.
const BIN_PROBE_SIZE: usize = 8 * 1024;

// What:     `read_with_binary_check(path)` reads a file under a binary
//           heuristic driven by the first `BIN_PROBE_SIZE` bytes (the
//           "probe"):
//             - The probe is always read and always returned.
//             - A file shorter than the probe is returned as-is.
//             - A full-size probe containing a NUL byte is returned
//               alone; whatever follows is treated as a binary tail
//               and is not read.
//             - A full-size, NUL-free probe is followed by the rest of
//               the file, and the whole content is returned.
//           Open and read failures are returned as `Err`.
// Why:      Closes the BUG-5 regression without re-opening the
//           soundness gap BUG 5 fixed. The heuristic BUG 5 removed
//           discarded the WHOLE file when its first 8 KiB held a NUL,
//           which hid secrets sitting BEFORE that NUL. Here the first
//           8 KiB is always scanned, while per-file work on large
//           binary blobs (firmware images, vmlinuz, font caches, lock
//           sidecars) is capped at 8 KiB. Accepted miss: a secret
//           located after the probe in a file that is larger than
//           8 KiB and has a NUL inside the probe. That is the "binary
//           blob whose bytes happen to spell a secret" case; the leak
//           risk is dominated by source files and small lock files,
//           which are still scanned in full.
// TS map:   `function readWithBinaryCheck(path: string): Buffer`.
//
// In TS you'd write (pseudocode):
// ```ts
// function readWithBinaryCheck(path: string): Buffer {
//   const fd = fs.openSync(path, "r");
//   try {
//     const probe = Buffer.alloc(BIN_PROBE_SIZE);
//     const n = fs.readSync(fd, probe, 0, BIN_PROBE_SIZE, null);
//     const full = n === BIN_PROBE_SIZE;
//     if (!full || probe.includes(0)) return probe.subarray(0, n);
//     return Buffer.concat([probe, readRestOf(fd)]);
//   } finally {
//     fs.closeSync(fd);
//   }
// }
// ```
fn read_with_binary_check(path: &str) -> Result<Vec<u8>, std::io::Error> {
    use std::io::Read;

    let mut source = std::fs::File::open(path)?;

    // What:     Read at most `BIN_PROBE_SIZE` bytes into `content`.
    //           `by_ref()` lends the handle to `take`, so `source` is
    //           still usable afterwards, positioned just past the probe.
    // Why:      The probe is what the binary decision is made on, and
    //           it is also the prefix of whatever gets returned.
    // TS map:   `fs.readSync(fd, probe, 0, BIN_PROBE_SIZE, null)`.
    let mut content: Vec<u8> = Vec::with_capacity(BIN_PROBE_SIZE);
    source
        .by_ref()
        .take(BIN_PROBE_SIZE as u64)
        .read_to_end(&mut content)?;

    // What:     Keep reading only when the probe was filled completely
    //           AND holds no NUL byte. `&&` short-circuits, so a short
    //           file is never scanned for NUL.
    // Why:      A short probe means the file is already fully read; a
    //           NUL in a full probe marks the file as binary, and the
    //           tail is deliberately left unread.
    // TS map:   `if (full && !probe.includes(0)) { /* read the rest */ }`.
    let probe_filled = content.len() >= BIN_PROBE_SIZE;
    if probe_filled && !content.contains(&0) {
        // Called on the `File` itself, appending after the probe bytes.
        source.read_to_end(&mut content)?;
    }

    Ok(content)
}
400
401// What:     `pub fn run_cli_from_env() -> Result<i32, String>` is the
402//           library entry point. It reads `env::args()` and env vars,
403//           parses flags, loads the ruleset, runs the parallel scan,
404//           prints hits to stderr, and returns the exit code the OS
405//           should see. `Result<i32, String>` lets the binary thin
406//           wrapper decide how to report a catastrophic failure (the
407//           `Err` arm) versus a regular run (`Ok(0)` clean,
408//           `Ok(1)` violation, `Ok(2)` usage error already eprinted).
409//           Sibling shape considered: returning `ExitCode` directly --
410//           rejected because tests written against the lib want a
411//           plain `i32` they can compare, and `ExitCode` has no `Eq`.
412// Why:      Coordinate arg parsing, ruleset loading, parallel scan,
413//           and result reporting from a unit testable surface. The bin
414//           target's `main` is now a five-line wrapper that turns the
415//           returned code into an `ExitCode` and prints `Err` to
416//           stderr with a fixed prefix.
417// TS map:   No entry-point function in TS; Node scripts just run top-
418//           to-bottom. Mentally picture an
419//           `async function runCliFromEnv(): Promise<number>` that
420//           the bin's tiny wrapper awaits and passes to
421//           `process.exit`.
422//
423// In TS you'd write (pseudocode):
424// ```ts
425// async function runCliFromEnv(): Promise<number> {
426//   // ...
427//   return anyViolation ? 1 : 0;
428// }
429// process.exit(await runCliFromEnv());
430// ```
431pub fn run_cli_from_env() -> Result<i32, String> {
432    // What:     `let args: Vec<String> = env::args().skip(1).collect();`
433    //           reads command-line arguments. `env::args()` returns an
434    //           iterator of `String`s where index 0 is the program name
435    //           ("forbidden-strings"); `.skip(1)` drops it; `.collect()`
436    //           materializes the remainder into a `Vec<String>`. The
437    //           explicit `Vec<String>` annotation tells `.collect()` what
438    //           container to build (without it, the collect call is
439    //           ambiguous). Sibling type to consider: `Vec<&str>` would
440    //           BORROW the args, but `env::args()` already yields owned
441    //           `String`s -- borrowing is not an option here.
442    // Why:      We need the user's actual flags/files; the program name
443    //           is irrelevant.
444    // TS map:   `const args = process.argv.slice(2);`.
445    //
446    // In TS you'd write (pseudocode):
447    // ```ts
448    // const args: string[] = process.argv.slice(2);
449    // ```
450    let args: Vec<String> = env::args().skip(1).collect();
451
452    // What:     `let mut rules_path: Option<String> = env::var("...").ok();`
453    //           reads an environment variable. `env::var` returns
454    //           `Result<String, VarError>` (Err if unset); `.ok()` converts
455    //           it into `Option<String>` -- `Some(value)` if set, `None`
456    //           otherwise. The `mut` lets us reassign `rules_path` later if
457    //           `--rules` overrides. Sibling type: `Option<&str>` would
458    //           need the env value to live somewhere else; `String` is
459    //           owned so it can outlive any function call.
460    // Why:      Initial source for the rules-file path; `--rules` flag
461    //           takes precedence and overwrites this.
462    // TS map:   `let rulesPath: string | undefined = process.env.FORBIDDEN_STRINGS_RULES;`.
463    //
464    // In TS you'd write (pseudocode):
465    // ```ts
466    // let rulesPath: string | undefined = process.env.FORBIDDEN_STRINGS_RULES;
467    // ```
468    let mut rules_path: Option<String> = env::var("FORBIDDEN_STRINGS_RULES").ok();
469
470    // What:     `let mut all = false;` declares a mutable boolean. No
471    //           type annotation needed -- the literal `false` infers `bool`.
472    // Why:      Tracks whether `--all` was passed; we toggle it to true
473    //           when we encounter the flag.
474    // TS map:   `let all = false;`.
475    //
476    // In TS you'd write (pseudocode):
477    // ```ts
478    // let all = false;
479    // ```
480    let mut all = false;
481
482    // What:     `let mut files: Vec<String> = Vec::new();` allocates an
483    //           empty growable, owned vector of `String`. `Vec::new()` is
484    //           the empty-vector constructor; the explicit type annotation
485    //           tells the compiler the element type since the empty
486    //           constructor cannot infer it. Sibling: `Vec<&str>` cannot
487    //           hold values that outlive the source; we want owned data.
488    // Why:      Accumulates positional file arguments as we parse argv.
489    // TS map:   `const files: string[] = [];`.
490    //
491    // In TS you'd write (pseudocode):
492    // ```ts
493    // const files: string[] = [];
494    // ```
495    let mut files: Vec<String> = Vec::new();
496
497    // What:     `let mut i: usize = 0;` declares a mutable index counter.
498    //           `usize` is the unsigned integer wide enough to address any
499    //           byte in memory on this platform (32 bits on 32-bit OS,
500    //           64 bits on 64-bit OS). Siblings the reader might expect:
501    //           `u32`, `u64`, `i32`, `i64`. Why `usize` not `u64`? Every
502    //           std API that takes a "size" or "index" wants `usize`;
503    //           mixing widths forces casts.
504    // Why:      Manual index lets us advance by 2 (consume `--rules` plus
505    //           its value) inside the loop body.
506    // TS map:   `let i = 0;` (TS has only one number type).
507    //
508    // In TS you'd write (pseudocode):
509    // ```ts
510    // let i = 0;
511    // ```
512    let mut i: usize = 0;
513
514    // What:     `while i < args.len() { ... }` is a basic conditional loop.
515    //           No iterator, no syntactic sugar -- just "keep going while
516    //           condition holds". `args.len()` returns the vector's length
517    //           as `usize`.
518    // Why:      We need manual index control to consume `--rules` plus
519    //           its argument together; a `for arg in &args` loop cannot
520    //           skip ahead.
521    // TS map:   `while (i < args.length) { ... }`.
522    //
523    // In TS you'd write (pseudocode):
524    // ```ts
525    // while (i < args.length) { ... }
526    // ```
527    while i < args.len() {
528        // What:     `let a = &args[i];` borrows the i-th element. `&` is
529        //           Rust's "borrow" operator: it gives a read-only
530        //           reference to the value without taking ownership; the
531        //           original vector still owns the `String`. Without `&`,
532        //           Rust would try to MOVE the `String` out of the vector,
533        //           which is illegal because `Vec<String>` does not
534        //           support hole-poking moves.
535        // Why:      We want to inspect the arg's contents (compare to
536        //           "--rules", etc.) without consuming it.
537        // TS map:   `const a = args[i];` -- TS has no ownership system,
538        //           so reading is always implicitly "borrowing".
539        //
540        // In TS you'd write (pseudocode):
541        // ```ts
542        // const a = args[i];
543        // ```
544        let a = &args[i];
545        if a == "--rules" {
546            i += 1;
547            if i >= args.len() {
548                eprintln!("--rules needs an argument");
549                // What:     `return Ok(2);` early-exits `run_cli_from_env`
550                //           with the eventual OS exit code 2. `Ok(...)`
551                //           wraps the `i32` into the success variant of
552                //           `Result<i32, String>`; the bin wrapper turns
553                //           it into `ExitCode::from(2)`.
554                // Why:      Convention: 0 = success, 1 = violation,
555                //           2 = usage / config error. The usage message
556                //           was already printed on the previous line.
557                // TS map:   `return 2;`.
558                //
559                // In TS you'd write (pseudocode):
560                // ```ts
561                // return 2;
562                // ```
563                return Ok(2);
564            }
565            // What:     `rules_path = Some(args[i].clone());` reassigns
566            //           the `Option<String>` variable. `Some(...)` wraps
567            //           a value into the present variant of `Option`;
568            //           `args[i].clone()` deep-copies the indexed `String`
569            //           so the assignment OWNS its bytes (we cannot move
570            //           out of a Vec, and a borrow would tie `rules_path`
571            //           to `args`'s lifetime).
572            // Why:      Capture the argument that follows `--rules` as
573            //           our authoritative rules path.
574            // TS map:   `rulesPath = args[i];` -- TS strings are GC'd, no
575            //           clone needed.
576            //
577            // In TS you'd write (pseudocode):
578            // ```ts
579            // rulesPath = args[i];
580            // ```
581            rules_path = Some(args[i].clone());
582        } else if a == "--all" {
583            all = true;
584        } else if a == "--help" || a == "-h" {
585            // What:     `concat!` is a compile-time macro joining string
586            //           literals into a single `&'static str`. The `!`
587            //           marks it as a macro call, not a function call.
588            //           `env!("CARGO_PKG_VERSION")` reads `version` from
589            //           Cargo.toml at compile time and inlines it as a
590            //           string literal.
591            // Why:      Print a single static help string with the version
592            //           baked in, no runtime allocation, no formatter.
593            // TS map:   The TS analogue is template-literal concatenation
594            //           plus `process.env.npm_package_version` (read at
595            //           build time via a bundler define), but TS has no
596            //           macro system -- the closest mental model is
597            //           "compiled-in string template".
598            //
599            // In TS you'd write (pseudocode):
600            // ```ts
601            // const VERSION = process.env.npm_package_version!;
602            // const HELP = `forbidden-strings ${VERSION}\n...`;
603            // console.log(HELP);
604            // ```
605            println!(
606                "{}",
607                concat!(
608                    "forbidden-strings ", env!("CARGO_PKG_VERSION"), "\n",
609                    "Linear-time deny-list scanner for Git repos.\n",
610                    "\n",
611                    "USAGE:\n",
612                    "    forbidden-strings [--rules <PATH>] [--all] [FILE...]\n",
613                    "\n",
614                    "FLAGS:\n",
615                    "    --rules <PATH>    Path to the rule file (one rule per line).\n",
616                    "                      Overrides FORBIDDEN_STRINGS_RULES.\n",
617                    "                      Default: ./forbidden-strings.local.txt\n",
618                    "    --all             Scan every git-tracked file under cwd.\n",
619                    "                      Respects .gitignore (via the `ignore` crate).\n",
620                    "    -h, --help        Print this help and exit.\n",
621                    "    -V, --version     Print version and exit.\n",
622                    "\n",
623                    "ENV:\n",
624                    "    FORBIDDEN_STRINGS_RULES    Default rules path; --rules wins if both are set.\n",
625                    "                               If unset, falls back to ./forbidden-strings.local.txt\n",
626                    "\n",
627                    "EXIT CODES:\n",
628                    "    0    No violations.\n",
629                    "    1    One or more violations (printed to stderr, redacted).\n",
630                    "    2    Usage error or rule-file error.\n",
631                    "\n",
632                    "EXAMPLES:\n",
633                    "    # Scan a few files\n",
634                    "    forbidden-strings --rules ./rules.txt src/main.ts README.md\n",
635                    "\n",
636                    "    # Scan the whole working tree\n",
637                    "    FORBIDDEN_STRINGS_RULES=./rules.txt forbidden-strings --all\n",
638                    "\n",
639                    "RULE FORMAT:\n",
640                    "    Bare line              -> case-sensitive literal substring\n",
641                    "    /PATTERN/FLAGS         -> regex (resharp; supports A&B, ~(A), (?=...), (?<=...))\n",
642                    "    # ...                  -> comment\n",
643                    "    Empty line             -> skipped\n",
644                    "\n",
645                    "RESHARP LIMITATIONS (0.5.x through 0.6.x):\n",
646                    "    A `~(...)` complement body cannot contain `\\b`, `\\B`, `^`, `$`,\n",
647                    "    or any user-explicit lookaround. Use `\\W` or literal whitespace for\n",
648                    "    `\\b`; `\\A`/`\\z` for `^`/`$` when whole-content semantics fit; or\n",
649                    "    lift the boundary check outside the complement. Loader rejects every\n",
650                    "    failing shape with a named-trigger error. See docs/troubleshooting/resharp.md.\n",
651                    "\n",
652                    "OUTPUT:\n",
653                    "    PATH:LINE:COL_START..COL_END rule=N    (matched substring is NEVER printed)\n",
654                    "\n",
655                    "See README.md for set-algebra rule examples and CI integration.\n",
656                ),
657            );
658            return Ok(0);
659        } else if a == "--version" || a == "-V" {
660            // What:     Same `concat!` + `env!` trick: compile-time string
661            //           literal, no runtime cost. `env!` panics at compile
662            //           time if `CARGO_PKG_VERSION` is unset, which is
663            //           impossible inside a Cargo build.
664            // Why:      Match `cargo`/`rustc` convention -- `--version`
665            //           prints `<name> <semver>` on stdout.
666            // TS map:   `console.log(`forbidden-strings ${VERSION}`)`.
667            //
668            // In TS you'd write (pseudocode):
669            // ```ts
670            // console.log(`forbidden-strings ${VERSION}`);
671            // ```
672            println!("forbidden-strings {}", env!("CARGO_PKG_VERSION"));
673            return Ok(0);
674        } else if a.starts_with("--") || a.starts_with("-") && a.len() > 1 {
675            eprintln!("unknown flag {}", a);
676            return Ok(2);
677        } else {
678            // What:     `files.push(a.clone())`. `a` is a `&String`
679            //           (borrowed); `.clone()` deep-copies the `String`
680            //           so the new owned copy can be moved into the
681            //           vector. We cannot push the borrow itself --
682            //           `Vec<String>` requires owned `String`s and the
683            //           borrow's lifetime would not outlive `args`.
684            // Why:      Stash the positional file argument for later
685            //           scanning.
686            // TS map:   `files.push(a);` -- TS strings are GC'd; no clone.
687            //
688            // In TS you'd write (pseudocode):
689            // ```ts
690            // files.push(a);
691            // ```
692            files.push(a.clone());
693        }
694        // What:     `i += 1;` advances to the next argv slot. Plain
695        //           integer increment; no Rust-specific magic.
696        // Why:      Move past the just-consumed flag/value.
697        // TS map:   `i += 1;`.
698        //
699        // In TS you'd write (pseudocode):
700        // ```ts
701        // i += 1;
702        // ```
703        i += 1;
704    }
705
706    // What:     `unwrap_or_else(|| ...)` returns the inner `Some` value or
707    //           runs the closure to produce a fallback. The closure body
708    //           is a string literal converted to `String` via `.to_string()`.
709    // Why:      Default the rules path to `forbidden-strings.local.txt` in
710    //           cwd when neither `--rules` nor `FORBIDDEN_STRINGS_RULES`
711    //           is set, matching the conventional filename. The loader
712    //           emits a clear "file not found" error if the default
713    //           doesn't exist; we don't pre-check and shadow that error.
714    // TS map:   `rulesPath ?? "forbidden-strings.local.txt"`.
715    //
716    // In TS you'd write (pseudocode):
717    // ```ts
718    // const finalRulesPath = rulesPath ?? "forbidden-strings.local.txt";
719    // ```
720    let rules_path = rules_path.unwrap_or_else(|| "forbidden-strings.local.txt".to_string());
721
722    // Run `load_ruleset` and `list_files` concurrently when --all is
723    // set: rules loading is CPU-bound (regex compile + AC build);
724    // file walking is I/O-bound (directory traversal + gitignore parse).
725    // They share no state, so overlapping them shaves whichever side
726    // is shorter.
727    // What:     `rayon::join(|| f1(), || f2())` runs two closures in
728    //           parallel using the rayon threadpool. Returns a tuple
729    //           of their return values once both finish. If only one
730    //           closure has substantial work (e.g. when --all is off,
731    //           we have no file walk to do), join still runs both --
732    //           but the empty closure adds negligible cost.
733    // Why:      Rules load is ~12ms for a 1k-rule ruleset; file walk
734    //           is ~7ms on this repo. Sequential = 19ms; parallel = 12ms.
735    // TS map:   `await Promise.all([loadRuleset(rulesPath), listFiles(".")])`.
736    //
737    // In TS you'd write (pseudocode):
738    // ```ts
739    // const [rulesetResult, filesResult] = await Promise.all([
740    //   loadRuleset(rulesPath),
741    //   all ? listFiles(".") : Promise.resolve(null),
742    // ]);
743    // ```
744    let (ruleset_result, listed_result): (Result<_, String>, Option<Result<Vec<String>, String>>) =
745        rayon::join(
746            || load_ruleset(&rules_path),
747            || if all { Some(list_files(".")) } else { None },
748        );
749
750    // What:     `let ruleset = match ruleset_result { Ok(r) => r, Err(e) => { ...; return ... } };`
751    //           is a `match` expression destructuring a `Result<RuleSet, String>`.
752    //           `Ok(r)` binds the success payload to local `r` and
753    //           "evaluates" the arm to that value; `Err(e)` binds the
754    //           failure payload, prints it, and early-returns from
755    //           `main`. The match expression as a whole evaluates to
756    //           the `Ok` arm's value; assigning it to `ruleset` gives us
757    //           a plain `RuleSet` to use below (no more wrapper).
758    // Why:      Unwrap the `Result` while presenting a friendly error to
759    //           the user instead of a panic.
760    // TS map:   `try { ruleset = await loadRuleset(...); } catch (e) { console.error(...); process.exit(2); }`.
761    //
762    // In TS you'd write (pseudocode):
763    // ```ts
764    // let ruleset: RuleSet;
765    // try { ruleset = rulesetResult; }
766    // catch (e) { console.error(`forbidden-strings: ${e}`); process.exit(2); }
767    // ```
768    let ruleset = match ruleset_result {
769        Ok(r) => r,
770        Err(e) => {
771            eprintln!("forbidden-strings: {}", e);
772            return Ok(2);
773        }
774    };
775
776    if env::var("FORBIDDEN_STRINGS_DEBUG_BUCKETS").is_ok() {
777        let ac_cs_pat = ruleset.ac_meta.iter().filter(|m| matches!(m, crate::rules::AcMeta::RegexPrefix { .. })).count();
778        let ac_cs_lit = ruleset.ac_meta.iter().filter(|m| matches!(m, crate::rules::AcMeta::Literal { .. })).count();
779        let ac_ci_pat = ruleset.ac_meta_ci.len();
780        let residual_count: usize = ruleset.residual_shards.iter().map(|s| match s {
781            crate::rules::ResidualShard::Single { .. } => 1,
782            crate::rules::ResidualShard::Combined { positions, .. } => positions.len(),
783        }).sum();
784        let single_shard_count = ruleset.residual_shards.iter().filter(|s| matches!(s, crate::rules::ResidualShard::Single { .. })).count();
785        let combined_shard_count = ruleset.residual_shards.len() - single_shard_count;
786        eprintln!(
787            "forbidden-strings buckets: ac_cs_lit={} ac_cs_regex_prefix={} ac_ci_regex_prefix={} residual={} (in {} single + {} combined shards) regex_rules_total={}",
788            ac_cs_lit, ac_cs_pat, ac_ci_pat, residual_count, single_shard_count, combined_shard_count, ruleset.regex_rules.len(),
789        );
790        if env::var("FORBIDDEN_STRINGS_DEBUG_RESIDUAL_LIST").is_ok() {
791            for shard in &ruleset.residual_shards {
792                let positions: Vec<usize> = match shard {
793                    crate::rules::ResidualShard::Single { rule_pos } => vec![*rule_pos],
794                    crate::rules::ResidualShard::Combined { positions, .. } => positions.clone(),
795                };
796                for pos in positions {
797                    let r = &ruleset.regex_rules[pos];
798                    eprintln!("residual rule line={}", r.idx);
799                }
800            }
801        }
802    }
803
804    // What:     `if let Some(listed) = listed_result { match listed { ... } }`.
805    //           One-arm pattern match: enter the block ONLY when
806    //           `listed_result` is `Some`, binding the inner
807    //           `Result<Vec<String>, String>` to `listed`. Inside, a
808    //           regular `match` extracts `Ok` (replace `files` with the
809    //           walker's output) or `Err` (print, exit 2).
810    // Why:      `listed_result` is `Some(...)` only when `--all` was
811    //           passed; otherwise `None` and we skip silently, leaving
812    //           `files` set to whatever came from positional args.
813    // TS map:   `if (listedResult) { try { files = listedResult; } catch (e) { ... } }`.
814    //
815    // In TS you'd write (pseudocode):
816    // ```ts
817    // if (listedResult !== null) {
818    //   try { files = listedResult; }
819    //   catch (e) { console.error(`forbidden-strings: ${e}`); process.exit(2); }
820    // }
821    // ```
822    if let Some(listed) = listed_result {
823        match listed {
824            Ok(f) => files = f,
825            Err(e) => {
826                eprintln!("forbidden-strings: {}", e);
827                return Ok(2);
828            }
829        }
830    }
831
832    // Fused read+scan: each rayon thread maps one file's bytes
833    // (via mmap; falls back to `fs::read` if mmap fails) and
834    // immediately scans them. The two-phase split that used to live
835    // here (Phase A reads, Phase B scans) traded cache locality for
836    // a clean separation but produced no measurable speedup -- after
837    // P1 the AC scan is so fast that file bytes go from disk to AC to
838    // discard within tens of microseconds. Fusing keeps each file's
839    // bytes hot in L1/L2 across the read->scan boundary instead of
840    // risking eviction during the materialize-then-iterate round trip.
841    // What:     `files.par_iter().flat_map_iter(|p| { try mmap(p); scan_content(p, &bytes, &rs) }).collect::<Vec<String>>()`
842    //           runs map+scan as one rayon work unit per file. The
843    //           closure's `Mmap` (or `Vec<u8>` fallback) lives only
844    //           until the scan finishes for that file; rayon
845    //           work-steals across cores.
846    // Why:      Mmap saves the alloc + memcpy that `fs::read` does.
847    //           On a hot page cache, that's measurable on `--all`;
848    //           on a cold cache, MADV_SEQUENTIAL lets the kernel
849    //           readahead-pipeline files. Fallback to `fs::read`
850    //           handles the cases mmap can't (empty files, /proc
851    //           entries, character devices).
852    // TS map:   `(await Promise.all(files.map(async (p) => scanContent(p, await readFileFastest(p), rs)))).flat()`.
853    //
854    // In TS you'd write (pseudocode):
855    // ```ts
856    // const hits = (await Promise.all(
857    //   files.map(async (p) => scanContent(p, await readFileFastest(p), ruleset))
858    // )).flat();
859    // ```
860    // What:     Build the canonical-path skip set once at startup
861    //           (rather than per-file). The set captures the actual
862    //           rules file plus the canonical generated-source paths;
863    //           empty when none of them resolve. Used only in --all
864    //           mode to filter walker output.
865    // Why:      Closes BUG 6: explicit positional args are never
866    //           skipped; only walker output is filtered, and the filter
867    //           is path-anchored (not basename-anchored), so
868    //           `sub/forbidden-strings.local.txt` no longer collides
869    //           with the actual rules file path.
870    // TS map:   `const skipSet = buildSkipSet(rulesPath);`.
871    //
872    // In TS you'd write (pseudocode):
873    // ```ts
874    // const skipSet = buildSkipSet(rulesPath);
875    // ```
876    let skip_set = if all { build_skip_set(&rules_path) } else { std::collections::HashSet::new() };
877
878    // What:     Canonical cwd, resolved once. `is_config_file_at_cwd`
879    //           compares each candidate's canonical parent against this to
880    //           skip the scanner's own `forbidden-strings.*.txt` ruleset
881    //           files at the repo root, in both --all and explicit-arg modes.
882    // Why:      Resolve symlinks once here rather than per file.
883    let cwd_canonical = std::fs::canonicalize(".").ok();
884
885    let hits: Vec<String> = files
886        .par_iter()
887        .flat_map_iter(|p| {
888            // Always skip the scanner's own ruleset files at cwd
889            // (forbidden-strings.*.txt), regardless of --all vs explicit
890            // args: they hold literal rule bodies that self-match.
891            if is_config_file_at_cwd(p, cwd_canonical.as_deref()) {
892                return Vec::new();
893            }
894            // What:     `if all && is_walker_skipped(p, &skip_set) { return Vec::new(); }`.
895            //           Only runs the skip check on walker output
896            //           (--all mode). For explicit positional args
897            //           (`forbidden-strings <path>...` without --all),
898            //           the file is ALWAYS scanned -- the user asked.
899            // Why:      Closes BUG 6: the previous basename-based skip
900            //           hid real findings on
901            //           `sub/forbidden-strings.local.txt` and friends
902            //           passed as explicit args. The new check applies
903            //           only when the walker discovered the file
904            //           automatically.
905            //
906            //           Inside the conditional, `is_walker_skipped`
907            //           canonicalizes the path and compares against
908            //           the pre-built skip set. Path-anchored matching
909            //           also closes BUG 11 (Windows backslash basename)
910            //           by routing through `std::fs::canonicalize`.
911            // TS map:   `if (all && isWalkerSkipped(p, skipSet)) return [];`.
912            //
913            // In TS you'd write (pseudocode):
914            // ```ts
915            // if (all && isWalkerSkipped(p, skipSet)) return [];
916            // ```
917            if all && is_walker_skipped(p, &skip_set) {
918                return Vec::new();
919            }
920            // What:     (Historical; superseded by the `match` further down.)
921            //           Formerly `let content = fs::read(p).unwrap_or_default();`.
922            //           `fs::read` returns `Result<Vec<u8>, io::Error>`
923            //           (the file's raw bytes or an I/O error).
924            //           `.unwrap_or_default()` extracts the `Ok` value or
925            //           substitutes `Vec::<u8>::default()` (the empty
926            //           vec) and SILENTLY DROPS the error. The implicit
927            //           inferred type is `Vec<u8>`. Sibling pattern:
928            //           `fs::read_to_string` returns `Result<String, _>`
929            //           but requires UTF-8 -- we want raw bytes here
930            //           because rules scan binary files too.
931            // Why:      Under the old code, a file we couldn't read
932            //           (permissions, vanished, etc.) became "empty
933            //           content" and produced zero hits; the rationale
934            //           was that crashing the whole walk is worse.
935            // TS map:   `try { content = await readFile(p); } catch { content = new Uint8Array(); }`.
936            // Gotcha:   `.unwrap_or_default()` SILENTLY discards the
937            //           `io::Error`; the current code reports it instead.
938            //
939            // In TS you'd write (pseudocode):
940            // ```ts
941            // let content: Uint8Array;
942            // try { content = await readFile(p); }
943            // catch (e) { return [`${p}: read error: ${e.message}`]; }
944            // return scanContent(p, content, ruleset);
945            // ```
946            // What:     `match read_with_binary_check(p) { Ok(c) => ..., Err(e) => ... }`.
947            //           Read error path now emits a synthetic "hit"
948            //           string formatted as `{path}: read error: {err}`
949            //           instead of silently substituting empty content.
950            //           The synthetic entry makes the file appear in the
951            //           output report AND keeps the exit code at 1 (hits
952            //           non-empty -> ExitCode::from(1) downstream).
953            // Why:      Closes BUG 4. Pre-fix, `fs::read(p).unwrap_or_default()`
954            //           dropped every io::Error: permissions, missing
955            //           file, broken symlink, /proc EACCES, etc. became
956            //           "empty content", the scan emitted zero hits, and
957            //           the run exited 0. A secret-scanning CI control
958            //           must NOT silently pass on unreadable files; the
959            //           operator needs to know they had no signal.
960            // TS map:   `try { ... } catch (e) { return [makeError(p, e)] }`.
961            //
962            // In TS you'd write (pseudocode):
963            // ```ts
964            // try { content = await readFile(p); }
965            // catch (e) { return [`${p}: read error: ${e}`]; }
966            // ```
967            let content = match read_with_binary_check(p) {
968                Ok(c) => c,
969                Err(e) => {
970                    return vec![format!("{}: read error: {}", p, e)];
971                }
972            };
973            // What:     `scan_content(p, &content, &ruleset)` is a function
974            //           call. `&content` and `&ruleset` are BORROW
975            //           expressions: we lend the vec and ruleset to the
976            //           callee read-only. The callee returns a fresh
977            //           `Vec<String>` of hits which becomes this closure's
978            //           tail expression (no `;` -> implicit return).
979            // Why:      Hand the just-read bytes to the scanner; the
980            //           returned hits become this closure's contribution
981            //           to the parallel-flat_map output.
982            // TS map:   `return scanContent(p, content, ruleset);`.
983            //
984            // In TS you'd write (pseudocode):
985            // ```ts
986            // return scanContent(p, content, ruleset);
987            // ```
988            scan_content(p, &content, &ruleset)
989        })
990        .collect();
991
992    // What:     `std::io::stderr().lock()` returns a `StderrLock`, an
993    //           RAII handle holding the stderr mutex. Held writes
994    //           don't interleave with other threads.
995    // Why:      Print all hits in one batch.
996    // TS map:   No equivalent; Node has no stderr lock concept.
997    //
998    // In TS you'd write (pseudocode):
999    // ```ts
1000    // for (const h of hits) process.stderr.write(h + "\n");
1001    // ```
1002    let stderr = std::io::stderr();
1003    let mut handle = stderr.lock();
1004    for h in &hits {
1005        let _ = writeln!(handle, "{}", h);
1006    }
1007
1008    // What:     `if hits.is_empty() { Ok(0) } else { Ok(1) }`.
1009    //           This is an `if`-as-EXPRESSION (not statement) with no
1010    //           trailing `;`: its value becomes the function's return.
1011    //           `Ok(0)` and `Ok(1)` construct the success variant of
1012    //           `Result<i32, String>` with the OS-exit code inside; the
1013    //           bin wrapper converts to `ExitCode` for the actual exit.
1014    // Why:      No hits = clean exit; one or more hits = "violation"
1015    //           exit so CI marks the run as failed.
1016    // TS map:   `return hits.length === 0 ? 0 : 1;`.
1017    //
1018    // In TS you'd write (pseudocode):
1019    // ```ts
1020    // return hits.length === 0 ? 0 : 1;
1021    // ```
1022    if hits.is_empty() {
1023        Ok(0)
1024    } else {
1025        Ok(1)
1026    }
1027}