forbidden_strings/lib.rs
1// TODO: deferred perf work. See /home/user/.claude/plans/dapper-coalescing-horizon.md.
2// TODO: - L2: line-start index for `line_and_col` -- only matters on the
3// TODO: violation path; revisit if a single file with many hits ever becomes
4// TODO: a real workload.
5// TODO: - Z1: serialize the regex-bucket combined DFA. Resharp 0.5 has no
6// TODO: serialization API; would require swapping that gate to the `regex`
7// TODO: crate (`regex-automata::dfa::dense::DFA::to_bytes`). Trigger: when
8// TODO: startup-only time goes back over ~100ms after P1+P2 land.
9
10// What: `mod walk;` declares a child module whose source lives in
11// `walk.rs` (sibling to this file). `mod` is Rust's module
12// system: it does NOT import names; it simply tells the
13// compiler "this file/module exists, compile it". Names are
14// referenced via `crate::walk::xxx` afterward.
15// Why: We split the binary into four files so each unit is
16// focused: `walk.rs` for the working-tree walker that
17// respects `.gitignore`.
18// TS map: Closer to a tsconfig file's "include" entry than to an
19// `import`. The actual `import` happens via the `use` lines
20// below.
21// Gotcha: `mod foo;` without a body is NOT an import; it's a
22// registration. Forgetting to write `mod` for a sibling file
23// silently excludes it from the build.
24//
25// In TS you'd write (pseudocode):
26// ```ts
27// // No equivalent. Closest: TypeScript automatically picks up files
28// // in `include` paths; Rust requires explicit `mod` declarations.
29// ```
30mod rules;
31mod scan;
32mod scan_format;
33mod walk;
34
35// What: `#[cfg(feature = "fuzzing")] pub mod fuzz_api;` registers
36// the curated re-export module ONLY when the `fuzzing`
37// Cargo feature is on. The bin target leaves the feature
38// off and never sees this module; fuzz targets (in
39// `fuzz/Cargo.toml`) enable the feature and use
40// `forbidden_strings::fuzz_api::*` to import internals.
41// Why: Keep the production public surface unchanged while
42// letting fuzz targets reach the internal helpers they
43// need.
44// TS map: `if (process.env.FUZZING) { export * as fuzzApi from "./fuzz_api"; }`
45// in spirit; no clean 1:1 equivalent because TS has no
46// compile-time feature gates.
47//
48// In TS you'd write (pseudocode):
49// ```ts
50// // No clean equivalent; conditional re-export at build time.
51// ```
52#[cfg(feature = "fuzzing")]
53pub mod fuzz_api;
54
55// What: `use std::env;` imports the std `env` module so we can
56// reference `env::args` / `env::var`.
57// Why: Reading argv and environment variables.
58// TS map: `import { argv, env } from "node:process";`.
59//
60// In TS you'd write (pseudocode):
61// ```ts
62// import { argv, env } from "node:process";
63// ```
64use std::env;
65
66// What: `std::fs::canonicalize` is referenced via the full path at
67// three sites (rules-path skip set, walker skip lookup); no
68// bare `use std::fs;` because the per-file `fs::read` slurp
69// moved into `read_with_binary_check` which uses
70// `std::fs::File` directly.
71// Why: Background on file-reading performance choices: `fs::read`
72// is empirically faster than `mmap`-based access on this
73// workload (many small files; per-file VMA setup cost
74// dominates the saved alloc) -- the E2 mmap experiment
75// regressed wall time by 35% on Mono and 43% on the Linux
76// kernel. See PERF.md "Mmap experiment (rejected)".
77// Thread-local scratch buffers were also tried 2026-05-03;
78// rayon's nested-parallelism work-stealing (scan_content
79// uses inner par_iter via prefix-matched and combined-
80// shard fan-out) re-entered the outer flat_map_iter on
81// the SAME thread while the buffer was borrowed,
82// triggering a `RefCell already borrowed` panic. The
83// per-file alloc cost is dwarfed by the unicode-mode
84// speedup; not worth the re-entrancy hazard.
85// TS map: N/A (no import line; `std::fs::canonicalize` is fully
86// qualified at use sites).
87
88// What: `use std::io::Write;` imports the `Write` TRAIT (interface-
89// like). Methods declared by a trait are only callable when
90// the trait is in scope, even when used via macros like
91// `writeln!`.
92// Why: We use `writeln!(handle, ...)` to emit hits.
93// TS map: No 1:1 equivalent; in TS, methods are always callable.
94//
95// In TS you'd write (pseudocode):
96// ```ts
97// // Unnecessary in TS.
98// ```
99use std::io::Write;
100
101// What: `use rayon::prelude::*;` brings rayon's parallel-iterator
102// extension methods into scope (`par_iter`, `flat_map_iter`,
103// etc.).
104// Why: The fused read+scan main loop uses `par_iter` so that each
105// file is read and scanned on a rayon worker thread.
106// TS map: No equivalent.
107//
108// In TS you'd write (pseudocode):
109// ```ts
110// // No equivalent.
111// ```
112use rayon::prelude::*;
113
114// What: `use crate::walk::list_files;` imports the named function
115// from the sibling module so it can be called by its short name here
116// (a private `use` is not a re-export). `crate::` is the absolute root of this crate.
117// Why: We call `list_files(".")` once when `--all` mode is
118// selected to enumerate every scannable file.
119// TS map: `import { listFiles } from "./walk";`.
120//
121// In TS you'd write (pseudocode):
122// ```ts
123// import { listFiles } from "./walk";
124// ```
125use crate::rules::load_ruleset;
126use crate::scan::scan_content;
127use crate::walk::list_files;
128
129// What: `fn build_skip_set(rules_path: &str) -> HashSet<PathBuf>`
130// returns the set of CANONICAL absolute paths to skip when
131// walking the tree in `--all` mode. Pre-fix this logic was a
132// basename check (`is_skipped_file`) that matched anywhere in
133// the tree, so an unrelated `sub/forbidden-strings.local.txt`
134// was silently dropped along with the actual rule file. Path-
135// anchored matching pins each skip to its specific filesystem
136// location.
137// Why: Closes BUG 6 (basename skip applies to arbitrary explicit
138// args) and BUG 11 (Windows path basename via rsplit('/')) in
139// one shape change. Path-anchoring removes both failure modes:
140// the basename collision cannot trigger because we compare
141// full canonical paths, and the Windows backslash separator
142// is handled inside `std::fs::canonicalize` / `PathBuf::eq`.
143//
144// Skip set composition:
145// - The actual rules file (whatever the user passed via
146// `--rules` or `FORBIDDEN_STRINGS_RULES`; falls back to
147// the default `forbidden-strings.local.txt` in cwd).
148// - Four canonical self-match paths at their expected
149// locations relative to repo root. Each file contains
150// literal copies of rule bodies (generated source) or
151// documented example matches (rules-engine test
152// fixtures); scanning them in --all mode produces noise.
153// If running from a different cwd they fail to
154// canonicalize and are silently dropped from the set;
155// matching is still correct for the rules file alone.
156//
157// The caller separately decides WHEN to apply the skip:
158// explicit positional args are NEVER skipped (the user asked
159// for them); only walker output in --all mode is filtered.
160// TS map: `function buildSkipSet(rulesPath: string): Set<string>`.
161//
162// In TS you'd write (pseudocode):
163// ```ts
164// function buildSkipSet(rulesPath: string): Set<string> {
165// const set = new Set<string>();
166// try { set.add(fs.realpathSync(rulesPath)); } catch {}
167// for (const k of CANONICAL_SELF_MATCH_PATHS) {
168// try { set.add(fs.realpathSync(k)); } catch {}
169// }
170// return set;
171// }
172// ```
fn build_skip_set(rules_path: &str) -> std::collections::HashSet<std::path::PathBuf> {
    // What: Fixed list of repo-root-relative files known to contain
    //       literal copies of rule bodies (generated source) or
    //       documented example matches (rules-engine test fixtures).
    // Why: Scanning them in --all mode would only report self-matches.
    //      Each entry is pinned to one location, so an unrelated file
    //      with the same name elsewhere in the tree is still scanned.
    // TS map: `const SELF_MATCH_PATHS = ["...", "...", "...", "..."] as const;`.
    const SELF_MATCH_PATHS: [&str; 4] = [
        "packages/cli/forbidden-strings/data/betterleaks-default-config.toml",
        "packages/cli/forbidden-strings/src/port-betterleaks-relaxations.ts",
        "forbidden-strings.local.example.txt",
        "packages/cli/forbidden-strings/src/rules/algebra_tests.rs",
    ];

    // What: One iterator pipeline. The rules file comes first, then the
    //       self-match paths; every candidate goes through
    //       `std::fs::canonicalize` (symlinks resolved, path made
    //       absolute) and only the successes are collected into the
    //       returned `HashSet<PathBuf>`.
    // Why: A candidate that fails to canonicalize (missing rules file,
    //      run from a different cwd, relocated fixture) is dropped
    //      silently: nothing exists at that location, so there is
    //      nothing to skip. A missing rules file is expected to be
    //      reported by the rules loader, not here. Relative candidates
    //      resolve against the current working directory.
    // TS map: `new Set(candidates.flatMap(p => { try { return [fs.realpathSync(p)]; } catch { return []; } }))`.
    std::iter::once(rules_path)
        .chain(SELF_MATCH_PATHS)
        .filter_map(|candidate| std::fs::canonicalize(candidate).ok())
        .collect()
}
237
238// What: `fn is_walker_skipped(path: &str, skip_set: &HashSet<PathBuf>) -> bool`
239// returns true when the path's canonical form matches a
240// skip-set entry. Used ONLY for walker output in --all mode;
241// explicit positional args bypass this check entirely.
242// Why: Closes BUG 6: the previous `is_skipped_file` ran on every
243// queued path regardless of source, hiding real positive
244// findings on `sub/forbidden-strings.local.txt`-style explicit
245// args. The path-anchored form here is consulted only when
246// the caller knows the path came from the walker.
247// TS map: `function isWalkerSkipped(path: string, skipSet: Set<string>): boolean`.
248//
249// In TS you'd write (pseudocode):
250// ```ts
251// function isWalkerSkipped(path: string, skipSet: Set<string>): boolean {
252// try {
253// const canonical = fs.realpathSync(path);
254// return skipSet.has(canonical);
255// } catch { return false; }
256// }
257// ```
fn is_walker_skipped(
    path: &str,
    skip_set: &std::collections::HashSet<std::path::PathBuf>,
) -> bool {
    // What: Resolve `path` to its canonical form and report whether that
    //       exact `PathBuf` is a member of `skip_set`. A canonicalize
    //       failure (e.g. broken symlink, file vanished since the walk)
    //       yields `false`.
    // Why: A path that cannot be resolved cannot be proven to be one of
    //      the skip targets, so it stays in the scan queue and any read
    //      problem is left for the later read step to report.
    // TS map: `try { return skipSet.has(fs.realpathSync(path)); } catch { return false; }`.
    match std::fs::canonicalize(path) {
        Ok(resolved) => skip_set.contains(&resolved),
        Err(_) => false,
    }
}
284
// What: `BIN_PROBE_SIZE` is the number of leading bytes read from every
//       file before deciding whether the file is binary: 8 KiB, the
//       same probe size the pre-BUG-5 `is_likely_binary` heuristic
//       used. It is in line with the window `git diff` inspects for
//       its "binary or text" decision (NOTE(review): git's own constant
//       is believed to be 8000 bytes, not 8192 -- confirm if exact
//       parity matters).
// Why: The probe length is a tradeoff. A smaller probe lets a binary
//      file with a leading text header (PDF header, Mach-O header)
//      slip through as text; a larger probe wastes memory on small
//      files. 8 KiB catches the common cases (PNG, JPG, ELF, WASM,
//      zip, ZSTD frames) and is the established convention.
const BIN_PROBE_SIZE: usize = 8 * 1024;
297
298// What: `read_with_binary_check(path)` reads a file under a binary
299// heuristic:
300// 1. Always read the first `BIN_PROBE_SIZE` bytes.
301// 2. If the file is smaller than that, return what we got.
302// 3. If the probe contains a NUL byte and the file is
303// larger than the probe, return only the probe (the
304// rest is treated as binary tail and not scanned).
305// 4. Otherwise (probe is NUL-free), read and return the
306// full file.
307// Why: Closes the BUG-5 regression without re-introducing the
308// soundness gap that BUG 5 fixed. BUG 5 removed a heuristic
309// that threw away the WHOLE file when the first 8 KiB
310// contained a NUL byte; that masked secrets sitting BEFORE
311// the NUL. This rule keeps that signal (the first 8 KiB is
312// always scanned), but caps the per-file work on large
313// binary blobs (firmware images, vmlinuz, font caches, lock
314// sidecars) at 8 KiB instead of full content. Accepted
315// miss: a secret located PAST the first 8 KiB of a file whose
316// first 8 KiB contains a NUL byte. Rationale: those files are the
317// "binary blob with bytes that happen to spell a secret"
318// case, and the secret-leak risk is dominated by source
319// files and small lock files which still scan in full.
320// TS map: `function readWithBinaryCheck(path: string): Buffer`.
321//
322// In TS you'd write (pseudocode):
323// ```ts
324// function readWithBinaryCheck(path: string): Buffer {
325// const fd = fs.openSync(path, "r");
326// try {
327// const probe = Buffer.alloc(BIN_PROBE_SIZE);
328// const n = fs.readSync(fd, probe, 0, BIN_PROBE_SIZE, null);
329// if (n < BIN_PROBE_SIZE) return probe.subarray(0, n);
330// if (probe.indexOf(0) !== -1) return probe;
331// return Buffer.concat([probe, fs.readSync.readRestOf(fd)]);
332// } finally {
333// fs.closeSync(fd);
334// }
335// }
336// ```
337fn read_with_binary_check(path: &str) -> Result<Vec<u8>, std::io::Error> {
338 use std::fs::File;
339 use std::io::Read;
340
341 let mut file = File::open(path)?;
342 let mut buf: Vec<u8> = Vec::with_capacity(BIN_PROBE_SIZE);
343 (&mut file)
344 .take(BIN_PROBE_SIZE as u64)
345 .read_to_end(&mut buf)?;
346
347 if buf.len() < BIN_PROBE_SIZE {
348 return Ok(buf);
349 }
350
351 if memchr::memchr(0, &buf).is_some() {
352 return Ok(buf);
353 }
354
355 file.read_to_end(&mut buf)?;
356 Ok(buf)
357}
358
359// What: `pub fn run_cli_from_env() -> Result<i32, String>` is the
360// library entry point. It reads `env::args()` and env vars,
361// parses flags, loads the ruleset, runs the parallel scan,
362// prints hits to stderr, and returns the exit code the OS
363// should see. `Result<i32, String>` lets the binary thin
364// wrapper decide how to report a catastrophic failure (the
365// `Err` arm) versus a regular run (`Ok(0)` clean,
366// `Ok(1)` violation, `Ok(2)` usage error already eprinted).
367// Sibling shape considered: returning `ExitCode` directly --
368// rejected because tests written against the lib want a
369// plain `i32` they can compare, and `ExitCode` has no `Eq`.
370// Why: Coordinate arg parsing, ruleset loading, parallel scan,
371// and result reporting from a unit testable surface. The bin
372// target's `main` is now a five-line wrapper that turns the
373// returned code into an `ExitCode` and prints `Err` to
374// stderr with a fixed prefix.
375// TS map: No entry-point function in TS; Node scripts just run top-
376// to-bottom. Mentally picture an
377// `async function runCliFromEnv(): Promise<number>` that
378// the bin's tiny wrapper awaits and passes to
379// `process.exit`.
380//
381// In TS you'd write (pseudocode):
382// ```ts
383// async function runCliFromEnv(): Promise<number> {
384// // ...
385// return anyViolation ? 1 : 0;
386// }
387// process.exit(await runCliFromEnv());
388// ```
389pub fn run_cli_from_env() -> Result<i32, String> {
390 // What: `let args: Vec<String> = env::args().skip(1).collect();`
391 // reads command-line arguments. `env::args()` returns an
392 // iterator of `String`s where index 0 is the program name
393 // ("forbidden-strings"); `.skip(1)` drops it; `.collect()`
394 // materializes the remainder into a `Vec<String>`. The
395 // explicit `Vec<String>` annotation tells `.collect()` what
396 // container to build (without it, the collect call is
397 // ambiguous). Sibling type to consider: `Vec<&str>` would
398 // BORROW the args, but `env::args()` already yields owned
399 // `String`s -- borrowing is not an option here.
400 // Why: We need the user's actual flags/files; the program name
401 // is irrelevant.
402 // TS map: `const args = process.argv.slice(2);`.
403 //
404 // In TS you'd write (pseudocode):
405 // ```ts
406 // const args: string[] = process.argv.slice(2);
407 // ```
408 let args: Vec<String> = env::args().skip(1).collect();
409
410 // What: `let mut rules_path: Option<String> = env::var("...").ok();`
411 // reads an environment variable. `env::var` returns
412 // `Result<String, VarError>` (Err if unset); `.ok()` converts
413 // it into `Option<String>` -- `Some(value)` if set, `None`
414 // otherwise. The `mut` lets us reassign `rules_path` later if
415 // `--rules` overrides. Sibling type: `Option<&str>` would
416 // need the env value to live somewhere else; `String` is
417 // owned so it can outlive any function call.
418 // Why: Initial source for the rules-file path; `--rules` flag
419 // takes precedence and overwrites this.
420 // TS map: `let rulesPath: string | undefined = process.env.FORBIDDEN_STRINGS_RULES;`.
421 //
422 // In TS you'd write (pseudocode):
423 // ```ts
424 // let rulesPath: string | undefined = process.env.FORBIDDEN_STRINGS_RULES;
425 // ```
426 let mut rules_path: Option<String> = env::var("FORBIDDEN_STRINGS_RULES").ok();
427
428 // What: `let mut all = false;` declares a mutable boolean. No
429 // type annotation needed -- the literal `false` infers `bool`.
430 // Why: Tracks whether `--all` was passed; we toggle it to true
431 // when we encounter the flag.
432 // TS map: `let all = false;`.
433 //
434 // In TS you'd write (pseudocode):
435 // ```ts
436 // let all = false;
437 // ```
438 let mut all = false;
439
440 // What: `let mut files: Vec<String> = Vec::new();` allocates an
441 // empty growable, owned vector of `String`. `Vec::new()` is
442 // the empty-vector constructor; the explicit type annotation
443 // tells the compiler the element type since the empty
444 // constructor cannot infer it. Sibling: `Vec<&str>` cannot
445 // hold values that outlive the source; we want owned data.
446 // Why: Accumulates positional file arguments as we parse argv.
447 // TS map: `const files: string[] = [];`.
448 //
449 // In TS you'd write (pseudocode):
450 // ```ts
451 // const files: string[] = [];
452 // ```
453 let mut files: Vec<String> = Vec::new();
454
455 // What: `let mut i: usize = 0;` declares a mutable index counter.
456 // `usize` is the unsigned integer wide enough to address any
457 // byte in memory on this platform (32 bits on 32-bit OS,
458 // 64 bits on 64-bit OS). Siblings the reader might expect:
459 // `u32`, `u64`, `i32`, `i64`. Why `usize` not `u64`? Every
460 // std API that takes a "size" or "index" wants `usize`;
461 // mixing widths forces casts.
462 // Why: Manual index lets us advance by 2 (consume `--rules` plus
463 // its value) inside the loop body.
464 // TS map: `let i = 0;` (TS has only one number type).
465 //
466 // In TS you'd write (pseudocode):
467 // ```ts
468 // let i = 0;
469 // ```
470 let mut i: usize = 0;
471
472 // What: `while i < args.len() { ... }` is a basic conditional loop.
473 // No iterator, no syntactic sugar -- just "keep going while
474 // condition holds". `args.len()` returns the vector's length
475 // as `usize`.
476 // Why: We need manual index control to consume `--rules` plus
477 // its argument together; a `for arg in &args` loop cannot
478 // skip ahead.
479 // TS map: `while (i < args.length) { ... }`.
480 //
481 // In TS you'd write (pseudocode):
482 // ```ts
483 // while (i < args.length) { ... }
484 // ```
485 while i < args.len() {
486 // What: `let a = &args[i];` borrows the i-th element. `&` is
487 // Rust's "borrow" operator: it gives a read-only
488 // reference to the value without taking ownership; the
489 // original vector still owns the `String`. Without `&`,
490 // Rust would try to MOVE the `String` out of the vector,
491 // which is illegal because `Vec<String>` does not
492 // support hole-poking moves.
493 // Why: We want to inspect the arg's contents (compare to
494 // "--rules", etc.) without consuming it.
495 // TS map: `const a = args[i];` -- TS has no ownership system,
496 // so reading is always implicitly "borrowing".
497 //
498 // In TS you'd write (pseudocode):
499 // ```ts
500 // const a = args[i];
501 // ```
502 let a = &args[i];
503 if a == "--rules" {
504 i += 1;
505 if i >= args.len() {
506 eprintln!("--rules needs an argument");
507 // What: `return Ok(2);` early-exits `run_cli_from_env`
508 // with the eventual OS exit code 2. `Ok(...)`
509 // wraps the `i32` into the success variant of
510 // `Result<i32, String>`; the bin wrapper turns
511 // it into `ExitCode::from(2)`.
512 // Why: Convention: 0 = success, 1 = violation,
513 // 2 = usage / config error. The usage message
514 // was already printed on the previous line.
515 // TS map: `return 2;`.
516 //
517 // In TS you'd write (pseudocode):
518 // ```ts
519 // return 2;
520 // ```
521 return Ok(2);
522 }
523 // What: `rules_path = Some(args[i].clone());` reassigns
524 // the `Option<String>` variable. `Some(...)` wraps
525 // a value into the present variant of `Option`;
526 // `args[i].clone()` deep-copies the indexed `String`
527 // so the assignment OWNS its bytes (we cannot move
528 // out of a Vec, and a borrow would tie `rules_path`
529 // to `args`'s lifetime).
530 // Why: Capture the argument that follows `--rules` as
531 // our authoritative rules path.
532 // TS map: `rulesPath = args[i];` -- TS strings are GC'd, no
533 // clone needed.
534 //
535 // In TS you'd write (pseudocode):
536 // ```ts
537 // rulesPath = args[i];
538 // ```
539 rules_path = Some(args[i].clone());
540 } else if a == "--all" {
541 all = true;
542 } else if a == "--help" || a == "-h" {
543 // What: `concat!` is a compile-time macro joining string
544 // literals into a single `&'static str`. The `!`
545 // marks it as a macro call, not a function call.
546 // `env!("CARGO_PKG_VERSION")` reads `version` from
547 // Cargo.toml at compile time and inlines it as a
548 // string literal.
549 // Why: Print a single static help string with the version
550 // baked in, no runtime allocation, no formatter.
551 // TS map: The TS analogue is template-literal concatenation
552 // plus `process.env.npm_package_version` (read at
553 // build time via a bundler define), but TS has no
554 // macro system -- the closest mental model is
555 // "compiled-in string template".
556 //
557 // In TS you'd write (pseudocode):
558 // ```ts
559 // const VERSION = process.env.npm_package_version!;
560 // const HELP = `forbidden-strings ${VERSION}\n...`;
561 // console.log(HELP);
562 // ```
563 println!(
564 "{}",
565 concat!(
566 "forbidden-strings ", env!("CARGO_PKG_VERSION"), "\n",
567 "Linear-time deny-list scanner for Git repos.\n",
568 "\n",
569 "USAGE:\n",
570 " forbidden-strings [--rules <PATH>] [--all] [FILE...]\n",
571 "\n",
572 "FLAGS:\n",
573 " --rules <PATH> Path to the rule file (one rule per line).\n",
574 " Overrides FORBIDDEN_STRINGS_RULES.\n",
575 " Default: ./forbidden-strings.local.txt\n",
576 " --all Scan every git-tracked file under cwd.\n",
577 " Respects .gitignore (via the `ignore` crate).\n",
578 " -h, --help Print this help and exit.\n",
579 " -V, --version Print version and exit.\n",
580 "\n",
581 "ENV:\n",
582 " FORBIDDEN_STRINGS_RULES Default rules path; --rules wins if both are set.\n",
583 " If unset, falls back to ./forbidden-strings.local.txt\n",
584 "\n",
585 "EXIT CODES:\n",
586 " 0 No violations.\n",
587 " 1 One or more violations (printed to stderr, redacted).\n",
588 " 2 Usage error or rule-file error.\n",
589 "\n",
590 "EXAMPLES:\n",
591 " # Scan a few files\n",
592 " forbidden-strings --rules ./rules.txt src/main.ts README.md\n",
593 "\n",
594 " # Scan the whole working tree\n",
595 " FORBIDDEN_STRINGS_RULES=./rules.txt forbidden-strings --all\n",
596 "\n",
597 "RULE FORMAT:\n",
598 " Bare line -> case-sensitive literal substring\n",
599 " /PATTERN/FLAGS -> regex (resharp; supports A&B, ~(A), (?=...), (?<=...))\n",
600 " # ... -> comment\n",
601 " Empty line -> skipped\n",
602 "\n",
603 "RESHARP LIMITATIONS (0.5.x through 0.6.x):\n",
604 " A `~(...)` complement body cannot contain `\\b`, `\\B`, `^`, `$`,\n",
605 " or any user-explicit lookaround. Use `\\W` or literal whitespace for\n",
606 " `\\b`; `\\A`/`\\z` for `^`/`$` when whole-content semantics fit; or\n",
607 " lift the boundary check outside the complement. Loader rejects every\n",
608 " failing shape with a named-trigger error. See TROUBLESHOOTING.resharp.md.\n",
609 "\n",
610 "OUTPUT:\n",
611 " PATH:LINE:COL_START..COL_END rule=N (matched substring is NEVER printed)\n",
612 "\n",
613 "See README.md for set-algebra rule examples and CI integration.\n",
614 ),
615 );
616 return Ok(0);
617 } else if a == "--version" || a == "-V" {
618 // What: Same `concat!` + `env!` trick: compile-time string
619 // literal, no runtime cost. `env!` panics at compile
620 // time if `CARGO_PKG_VERSION` is unset, which is
621 // impossible inside a Cargo build.
622 // Why: Match `cargo`/`rustc` convention -- `--version`
623 // prints `<name> <semver>` on stdout.
624 // TS map: `console.log(`forbidden-strings ${VERSION}`)`.
625 //
626 // In TS you'd write (pseudocode):
627 // ```ts
628 // console.log(`forbidden-strings ${VERSION}`);
629 // ```
630 println!("forbidden-strings {}", env!("CARGO_PKG_VERSION"));
631 return Ok(0);
632 } else if a.starts_with("--") || a.starts_with("-") && a.len() > 1 {
633 eprintln!("unknown flag {}", a);
634 return Ok(2);
635 } else {
636 // What: `files.push(a.clone())`. `a` is a `&String`
637 // (borrowed); `.clone()` deep-copies the `String`
638 // so the new owned copy can be moved into the
639 // vector. We cannot push the borrow itself --
640 // `Vec<String>` requires owned `String`s and the
641 // borrow's lifetime would not outlive `args`.
642 // Why: Stash the positional file argument for later
643 // scanning.
644 // TS map: `files.push(a);` -- TS strings are GC'd; no clone.
645 //
646 // In TS you'd write (pseudocode):
647 // ```ts
648 // files.push(a);
649 // ```
650 files.push(a.clone());
651 }
652 // What: `i += 1;` advances to the next argv slot. Plain
653 // integer increment; no Rust-specific magic.
654 // Why: Move past the just-consumed flag/value.
655 // TS map: `i += 1;`.
656 //
657 // In TS you'd write (pseudocode):
658 // ```ts
659 // i += 1;
660 // ```
661 i += 1;
662 }
663
664 // What: `unwrap_or_else(|| ...)` returns the inner `Some` value or
665 // runs the closure to produce a fallback. The closure body
666 // is a string literal converted to `String` via `.to_string()`.
667 // Why: Default the rules path to `forbidden-strings.local.txt` in
668 // cwd when neither `--rules` nor `FORBIDDEN_STRINGS_RULES`
669 // is set, matching the conventional filename. The loader
670 // emits a clear "file not found" error if the default
671 // doesn't exist; we don't pre-check and shadow that error.
672 // TS map: `rulesPath ?? "forbidden-strings.local.txt"`.
673 //
674 // In TS you'd write (pseudocode):
675 // ```ts
676 // const finalRulesPath = rulesPath ?? "forbidden-strings.local.txt";
677 // ```
678 let rules_path = rules_path.unwrap_or_else(|| "forbidden-strings.local.txt".to_string());
679
680 // Run `load_ruleset` and `list_files` concurrently when --all is
681 // set: rules loading is CPU-bound (regex compile + AC build);
682 // file walking is I/O-bound (directory traversal + gitignore parse).
683 // They share no state, so overlapping them shaves whichever side
684 // is shorter.
685 // What: `rayon::join(|| f1(), || f2())` runs two closures in
686 // parallel using the rayon threadpool. Returns a tuple
687 // of their return values once both finish. If only one
688 // closure has substantial work (e.g. when --all is off,
689 // we have no file walk to do), join still runs both --
690 // but the empty closure adds negligible cost.
691 // Why: Rules load is ~12ms for a 1k-rule ruleset; file walk
692 // is ~7ms on this repo. Sequential = 19ms; parallel = 12ms.
693 // TS map: `await Promise.all([loadRuleset(rulesPath), listFiles(".")])`.
694 //
695 // In TS you'd write (pseudocode):
696 // ```ts
697 // const [rulesetResult, filesResult] = await Promise.all([
698 // loadRuleset(rulesPath),
699 // all ? listFiles(".") : Promise.resolve(null),
700 // ]);
701 // ```
702 let (ruleset_result, listed_result): (Result<_, String>, Option<Result<Vec<String>, String>>) =
703 rayon::join(
704 || load_ruleset(&rules_path),
705 || if all { Some(list_files(".")) } else { None },
706 );
707
708 // What: `let ruleset = match ruleset_result { Ok(r) => r, Err(e) => { ...; return ... } };`
709 // is a `match` expression destructuring a `Result<RuleSet, String>`.
710 // `Ok(r)` binds the success payload to local `r` and
711 // "evaluates" the arm to that value; `Err(e)` binds the
712 // failure payload, prints it, and early-returns from
713 // `run_cli_from_env`. The match expression as a whole evaluates to
714 // the `Ok` arm's value; assigning it to `ruleset` gives us
715 // a plain `RuleSet` to use below (no more wrapper).
716 // Why: Unwrap the `Result` while presenting a friendly error to
717 // the user instead of a panic.
718 // TS map: `try { ruleset = await loadRuleset(...); } catch (e) { console.error(...); process.exit(2); }`.
719 //
720 // In TS you'd write (pseudocode):
721 // ```ts
722 // let ruleset: RuleSet;
723 // try { ruleset = rulesetResult; }
724 // catch (e) { console.error(`forbidden-strings: ${e}`); process.exit(2); }
725 // ```
726 let ruleset = match ruleset_result {
727 Ok(r) => r,
728 Err(e) => {
729 eprintln!("forbidden-strings: {}", e);
730 return Ok(2);
731 }
732 };
733
734 if env::var("FORBIDDEN_STRINGS_DEBUG_BUCKETS").is_ok() {
735 let ac_cs_pat = ruleset.ac_meta.iter().filter(|m| matches!(m, crate::rules::AcMeta::RegexPrefix { .. })).count();
736 let ac_cs_lit = ruleset.ac_meta.iter().filter(|m| matches!(m, crate::rules::AcMeta::Literal { .. })).count();
737 let ac_ci_pat = ruleset.ac_meta_ci.len();
738 let residual_count: usize = ruleset.residual_shards.iter().map(|s| match s {
739 crate::rules::ResidualShard::Single { .. } => 1,
740 crate::rules::ResidualShard::Combined { positions, .. } => positions.len(),
741 }).sum();
742 let single_shard_count = ruleset.residual_shards.iter().filter(|s| matches!(s, crate::rules::ResidualShard::Single { .. })).count();
743 let combined_shard_count = ruleset.residual_shards.len() - single_shard_count;
744 eprintln!(
745 "forbidden-strings buckets: ac_cs_lit={} ac_cs_regex_prefix={} ac_ci_regex_prefix={} residual={} (in {} single + {} combined shards) regex_rules_total={}",
746 ac_cs_lit, ac_cs_pat, ac_ci_pat, residual_count, single_shard_count, combined_shard_count, ruleset.regex_rules.len(),
747 );
748 if env::var("FORBIDDEN_STRINGS_DEBUG_RESIDUAL_LIST").is_ok() {
749 for shard in &ruleset.residual_shards {
750 let positions: Vec<usize> = match shard {
751 crate::rules::ResidualShard::Single { rule_pos } => vec![*rule_pos],
752 crate::rules::ResidualShard::Combined { positions, .. } => positions.clone(),
753 };
754 for pos in positions {
755 let r = &ruleset.regex_rules[pos];
756 eprintln!("residual rule line={}", r.idx);
757 }
758 }
759 }
760 }
761
762 // What: `if let Some(listed) = listed_result { match listed { ... } }`.
763 // One-arm pattern match: enter the block ONLY when
764 // `listed_result` is `Some`, binding the inner
765 // `Result<Vec<String>, String>` to `listed`. Inside, a
766 // regular `match` extracts `Ok` (replace `files` with the
767 // walker's output) or `Err` (print, exit 2).
768 // Why: `listed_result` is `Some(...)` only when `--all` was
769 // passed; otherwise `None` and we skip silently, leaving
770 // `files` set to whatever came from positional args.
771 // TS map: `if (listedResult) { try { files = listedResult; } catch (e) { ... } }`.
772 //
773 // In TS you'd write (pseudocode):
774 // ```ts
775 // if (listedResult !== null) {
776 // try { files = listedResult; }
777 // catch (e) { console.error(`forbidden-strings: ${e}`); process.exit(2); }
778 // }
779 // ```
780 if let Some(listed) = listed_result {
781 match listed {
782 Ok(f) => files = f,
783 Err(e) => {
784 eprintln!("forbidden-strings: {}", e);
785 return Ok(2);
786 }
787 }
788 }
789
790 // Fused read+scan: each rayon thread maps one file's bytes
791 // (via mmap; falls back to `fs::read` if mmap fails) and
792 // immediately scans them. The two-phase split that used to live
793 // here (Phase A reads, Phase B scans) traded cache locality for
794 // a clean separation but produced no measurable speedup -- after
795 // P1 the AC scan is so fast that file bytes go from disk to AC to
796 // discard within tens of microseconds. Fusing keeps each file's
797 // bytes hot in L1/L2 across the read->scan boundary instead of
798 // risking eviction during the materialize-then-iterate round trip.
799 // What: `files.par_iter().flat_map_iter(|p| { try mmap(p); scan_content(p, &bytes, &rs) }).collect::<Vec<String>>()`
800 // runs map+scan as one rayon work unit per file. The
801 // closure's `Mmap` (or `Vec<u8>` fallback) lives only
802 // until the scan finishes for that file; rayon
803 // work-steals across cores.
804 // Why: Mmap saves the alloc + memcpy that `fs::read` does.
805 // On a hot page cache, that's measurable on `--all`;
806 // on a cold cache, MADV_SEQUENTIAL lets the kernel
807 // readahead-pipeline files. Fallback to `fs::read`
808 // handles the cases mmap can't (empty files, /proc
809 // entries, character devices).
810 // TS map: `(await Promise.all(files.map(async (p) => scanContent(p, await readFileFastest(p), rs)))).flat()`.
811 //
812 // In TS you'd write (pseudocode):
813 // ```ts
814 // const hits = (await Promise.all(
815 // files.map(async (p) => scanContent(p, await readFileFastest(p), ruleset))
816 // )).flat();
817 // ```
818 // What: Build the canonical-path skip set once at startup
819 // (rather than per-file). The set captures the actual
820 // rules file plus the canonical generated-source paths;
821 // empty when none of them resolve. Used only in --all
822 // mode to filter walker output.
823 // Why: Closes BUG 6: explicit positional args are never
824 // skipped; only walker output is filtered, and the filter
825 // is path-anchored (not basename-anchored), so
826 // `sub/forbidden-strings.local.txt` no longer collides
827 // with the actual rules file path.
828 // TS map: `const skipSet = buildSkipSet(rulesPath);`.
829 //
830 // In TS you'd write (pseudocode):
831 // ```ts
832 // const skipSet = buildSkipSet(rulesPath);
833 // ```
834 let skip_set = if all { build_skip_set(&rules_path) } else { std::collections::HashSet::new() };
835
836 let hits: Vec<String> = files
837 .par_iter()
838 .flat_map_iter(|p| {
839 // What: `if all && is_walker_skipped(p, &skip_set) { return Vec::new(); }`.
840 // Only runs the skip check on walker output
841 // (--all mode). For explicit positional args
842 // (`forbidden-strings <path>...` without --all),
843 // the file is ALWAYS scanned -- the user asked.
844 // Why: Closes BUG 6: the previous basename-based skip
845 // hid real findings on
846 // `sub/forbidden-strings.local.txt` and friends
847 // passed as explicit args. The new check applies
848 // only when the walker discovered the file
849 // automatically.
850 //
851 // Inside the conditional, `is_walker_skipped`
852 // canonicalizes the path and compares against
853 // the pre-built skip set. Path-anchored matching
854 // also closes BUG 11 (Windows backslash basename)
855 // by routing through `std::fs::canonicalize`.
856 // TS map: `if (all && isWalkerSkipped(p, skipSet)) return [];`.
857 //
858 // In TS you'd write (pseudocode):
859 // ```ts
860 // if (all && isWalkerSkipped(p, skipSet)) return [];
861 // ```
862 if all && is_walker_skipped(p, &skip_set) {
863 return Vec::new();
864 }
865 // What: HISTORICAL (pre-fix form): `let content = fs::read(p).unwrap_or_default();`.
866 // `fs::read` returns `Result<Vec<u8>, io::Error>`
867 // (the file's raw bytes or an I/O error).
868 // `.unwrap_or_default()` extracts the `Ok` value or
869 // substitutes `Vec::<u8>::default()` (the empty
870 // vec) and SILENTLY DROPS the error. The
871 // inferred type is `Vec<u8>`. Sibling pattern:
872 // `fs::read_to_string` returns `Result<String, _>`
873 // but requires UTF-8 -- we want raw bytes here
874 // because rules scan binary files too.
875 // Why: Pre-fix, a file we couldn't read (permissions,
876 // vanished, etc.) became "empty content" and the scan
877 // pass produced zero hits for it. The intent was to avoid
878 // crashing the whole walk on one unreadable file.
879 // TS map: `try { content = await readFile(p); } catch { content = new Uint8Array(); }`.
880 // Gotcha: `.unwrap_or_default()` SILENTLY discards the
881 // `io::Error`. That is why this form was replaced: the
882 // `match` further down reports the read error as a hit.
883 //
884 // In TS you'd write (pseudocode):
885 // ```ts
886 // let content: Uint8Array;
887 // try { content = await readFile(p); }
888 // catch (e) { return [`${p}: read error: ${e.message}`]; }
889 // return scanContent(p, content, ruleset);
890 // ```
891 // What: `match read_with_binary_check(p) { Ok(c) => ..., Err(e) => ... }`.
892 // Read error path now emits a synthetic "hit"
893 // string formatted as `{path}: read error: {err}`
894 // instead of silently substituting empty content.
895 // The synthetic entry makes the file appear in the
896 // output report AND keeps the exit code at 1 (hits
897 // non-empty -> ExitCode::from(1) downstream).
898 // Why: Closes BUG 4. Pre-fix, `fs::read(p).unwrap_or_default()`
899 // dropped every io::Error: permissions, missing
900 // file, broken symlink, /proc EACCES, etc. became
901 // "empty content", the scan emitted zero hits, and
902 // the run exited 0. A secret-scanning CI control
903 // must NOT silently pass on unreadable files; the
904 // operator needs to know they had no signal.
905 // TS map: `try { ... } catch (e) { return [makeError(p, e)] }`.
906 //
907 // In TS you'd write (pseudocode):
908 // ```ts
909 // try { content = await readFile(p); }
910 // catch (e) { return [`${p}: read error: ${e}`]; }
911 // ```
912 let content = match read_with_binary_check(p) {
913 Ok(c) => c,
914 Err(e) => {
915 return vec![format!("{}: read error: {}", p, e)];
916 }
917 };
918 // What: `scan_content(p, &content, &ruleset)` is a function
919 // call. `&content` and `&ruleset` are BORROW
920 // expressions: we lend the vec and ruleset to the
921 // callee read-only. The callee returns a fresh
922 // `Vec<String>` of hits which becomes this closure's
923 // tail expression (no `;` -> implicit return).
924 // Why: Hand the just-read bytes to the scanner; the
925 // returned hits become this closure's contribution
926 // to the parallel-flat_map output.
927 // TS map: `return scanContent(p, content, ruleset);`.
928 //
929 // In TS you'd write (pseudocode):
930 // ```ts
931 // return scanContent(p, content, ruleset);
932 // ```
933 scan_content(p, &content, &ruleset)
934 })
935 .collect();
936
937 // What: `std::io::stderr().lock()` returns a `StderrLock`, an
938 // RAII handle holding the stderr mutex. Held writes
939 // don't interleave with other threads.
940 // Why: Print all hits in one batch.
941 // TS map: No equivalent; Node has no stderr lock concept.
942 //
943 // In TS you'd write (pseudocode):
944 // ```ts
945 // for (const h of hits) process.stderr.write(h + "\n");
946 // ```
947 let stderr = std::io::stderr();
948 let mut handle = stderr.lock();
949 for h in &hits {
950 let _ = writeln!(handle, "{}", h);
951 }
952
953 // What: `if hits.is_empty() { Ok(0) } else { Ok(1) }`.
954 // This is an `if`-as-EXPRESSION (not statement) with no
955 // trailing `;`: its value becomes the function's return.
956 // `Ok(0)` and `Ok(1)` construct the success variant of
957 // `Result<i32, String>` with the OS-exit code inside; the
958 // bin wrapper converts to `ExitCode` for the actual exit.
959 // Why: No hits = clean exit; one or more hits = "violation"
960 // exit so CI marks the run as failed.
961 // TS map: `return hits.length === 0 ? 0 : 1;`.
962 //
963 // In TS you'd write (pseudocode):
964 // ```ts
965 // return hits.length === 0 ? 0 : 1;
966 // ```
967 if hits.is_empty() {
968 Ok(0)
969 } else {
970 Ok(1)
971 }
972}