keyhog_scanner/engine/scan.rs
1// `scan_filters` is consumed in the `feature = "simd"` arm below (the
2// trigger-bitmap / fallback path). Lean builds compile that arm out, so
3// gate the glob to match — otherwise rustc warns about an unused import.
4#[cfg(feature = "simd")]
5use super::scan_filters::*;
6use super::*;
7#[cfg(feature = "simd")]
8use std::cell::RefCell;
9
10// The trigger-buffer pool is only used in the Hyperscan-prefilter
11// scratch path of `scan_coalesced` (gated `#[cfg(feature = "simd")]`).
12// Without `simd`, both the pool and the helper become dead code,
13// so gate them too - otherwise `cargo build --no-default-features`
14// (the no-Hyperscan Windows build) emits dead-code warnings.
15//
16// Note: a previous attempt extended this pool to the per-chunk
17// `collect_triggered_patterns_*` builders. That regressed the
18// long-lines bench by ~12% because those builders return
19// `Vec<u64>` to their callers - the pool can't save the
20// allocation, only adds the thread_local + RefCell overhead.
21// The pool's win is reuse of buffers that stay inside the pool.
22#[cfg(feature = "simd")]
thread_local! {
    /// One reusable trigger-bitmask vector per thread.
    ///
    /// Phase 1 of `scan_coalesced` needs `ac_len.div_ceil(64)` words of
    /// scratch for every chunk it prefilters. Allocating that per chunk
    /// means, on a 100k-file scan with 1500 patterns, roughly 2.4M tiny
    /// allocations against the global allocator. Keeping the vector here
    /// lets each rayon worker recycle a single buffer for all the chunks
    /// it handles. Starts out empty; `with_trigger_buffer` grows it on
    /// demand.
    static TRIGGER_POOL: RefCell<Vec<u64>> = const { RefCell::new(Vec::<u64>::new()) };
}
31
/// Run `f` on a zero-filled scratch slice of exactly `words_needed` words
/// borrowed from this thread's `TRIGGER_POOL`, and return whatever `f`
/// returns.
///
/// The pooled vector stays mutably borrowed for the whole call, so `f` must
/// not call `with_trigger_buffer` again on the same thread: the nested
/// `borrow_mut` would panic.
#[cfg(feature = "simd")]
#[inline]
fn with_trigger_buffer<R>(words_needed: usize, f: impl FnOnce(&mut [u64]) -> R) -> R {
    TRIGGER_POOL.with(|pool| {
        let mut words = pool.borrow_mut();
        // Truncate, then regrow with zeros: every word handed to `f` is a
        // fresh 0 while the vector's capacity is kept for the next call.
        words.clear();
        words.resize(words_needed, 0);
        f(words.as_mut_slice())
    })
}
45
46/// Compute the two per-pattern-constant confidence signals.
47/// Extracted so both `extract_grouped_matches` and
48/// `extract_plain_matches` share the same lazy `OnceCell` init
49/// closure body (Rust can't `impl FnOnce<>` to share inline).
50/// `pub(super)` so the extract submodule (`engine/extract.rs`) can call
51/// it after the scan.rs / extract.rs / process.rs split.
52pub(super) fn compute_pattern_signals(detector: &DetectorSpec, chunk: &Chunk) -> (bool, bool) {
53 let kw = detector
54 .keywords
55 .iter()
56 .any(|keyword| chunk.data.contains(keyword.as_str()));
57 let sf = chunk
58 .metadata
59 .path
60 .as_deref()
61 .map(crate::confidence::is_sensitive_path)
62 .unwrap_or(false);
63 (kw, sf)
64}
65
66impl CompiledScanner {
67 /// High-throughput coalesced scan: all files scanned in parallel,
68 /// zero overhead for non-hit files.
69 ///
70 /// Architecture:
71 /// Phase 1: Parallel HS prefilter on raw bytes (no prep, no alloc)
72 /// Phase 2: Full extraction only on hit files (~5% of total)
73 #[allow(clippy::needless_return)] // return needed under non-simd cfg branch
74 pub fn scan_coalesced(&self, chunks: &[keyhog_core::Chunk]) -> Vec<Vec<keyhog_core::RawMatch>> {
75 #[cfg(feature = "simd")]
76 use crate::hw_probe::ScanBackend;
77 use rayon::prelude::*;
78
79 #[cfg(not(feature = "simd"))]
80 {
81 // Parallel CPU dispatch - same reasoning as scan_chunks_with_backend:
82 // the per-chunk scan is independent and CPU-bound.
83 let mut results: Vec<Vec<keyhog_core::RawMatch>> =
84 chunks.par_iter().map(|c| self.scan(c)).collect();
85 super::boundary::scan_chunk_boundaries(self, chunks, &mut results);
86 return results;
87 }
88
89 #[cfg(feature = "simd")]
90 {
91 let Some(scanner) = &self.simd_prefilter else {
92 // Hyperscan failed to initialize at compile time - fall back
93 // to per-chunk parallel SimdCpu (or whichever backend the
94 // scanner picks), then preserve cross-window boundary recall.
95 let mut results: Vec<Vec<keyhog_core::RawMatch>> =
96 chunks.par_iter().map(|c| self.scan(c)).collect();
97 super::boundary::scan_chunk_boundaries(self, chunks, &mut results);
98 return results;
99 };
100
101 let ac_len = self.ac_map.len();
102
103 // Phase 1: Parallel HS scan on RAW bytes. No prepare, no Arc, no alloc
104 // for non-hit files. Thread-local scratch + a per-worker bitmask
105 // POOL eliminate the per-chunk `vec![0u64; …]` alloc - we still
106 // need owned Vecs in the result so phase 2 can consume them, but
107 // empty-result chunks return `None` and skip the alloc entirely.
108 let words_needed = ac_len.div_ceil(64);
109 let _p1 = std::time::Instant::now();
110 let triggers: Vec<Option<Vec<u64>>> = chunks
111 .par_iter()
112 .map(|chunk| {
113 let data = chunk.data.as_bytes();
114 // Cheap O(n) content prefilters before the Hyperscan
115 // automaton walk. mod.rs's per-chunk entry point screens
116 // with these (alphabet set + bigram bloom) but the
117 // coalesced phase-1 path historically fed every chunk's
118 // raw bytes straight into the much heavier
119 // `scanner.scan`. On a source-heavy monorepo the bloom
120 // (a single 4096-bit pass) rejects the majority of files
121 // that carry no detector literal-prefix, eliding the
122 // Hyperscan scratch scan on them. Same gates, same
123 // ordering, and the same `>= 64`-byte bloom guard as
124 // mod.rs so behaviour is identical to the non-coalesced
125 // path. A rejected chunk returns `None` (no trigger),
126 // which routes phase 2 down the keyword/entropy fallback
127 // branch exactly as a genuine no-HS-hit chunk would.
128 let alphabet_rejected = self
129 .alphabet_screen
130 .as_ref()
131 .is_some_and(|screen| !screen.screen(data));
132 if alphabet_rejected
133 || (data.len() >= 64 && !self.bigram_bloom.maybe_overlaps(data))
134 {
135 return None;
136 }
137 with_trigger_buffer(words_needed, |scratch| {
138 for (hs_id, _start, _end) in scanner.scan(data) {
139 let Some((_det, dedup_id, _grp)) = scanner.pattern_info(hs_id) else {
140 continue;
141 };
142 if let Some(orig) = self.hs_index_map.get(dedup_id) {
143 for &idx in orig {
144 let idx = idx as usize;
145 if idx < ac_len {
146 scratch[idx / 64] |= 1u64 << (idx % 64);
147 }
148 }
149 }
150 }
151 if scratch.iter().any(|&w| w != 0) {
152 Some(scratch.to_vec())
153 } else {
154 None
155 }
156 })
157 })
158 .collect();
159 let _p1e = _p1.elapsed();
160
161 // The phase-1 telemetry is purely a tracing::info! line, which
162 // is off at the default log level. `total_hs_matches` is a full
163 // popcount pass over every word of every hit bitmap; computing
164 // it unconditionally is O(total_words) of dead work per batch
165 // when info logging is disabled. Gate the whole summary (and its
166 // hit_count walk) behind an enabled check so the default path
167 // pays nothing.
168 if tracing::enabled!(tracing::Level::INFO) {
169 let hit_count = triggers.iter().filter(|t| t.is_some()).count();
170 let total_hs_matches: usize = triggers
171 .iter()
172 .filter_map(|t| t.as_ref())
173 .map(|t| t.iter().map(|w| w.count_ones() as usize).sum::<usize>())
174 .sum();
175 tracing::info!(
176 files = chunks.len(),
177 hits = hit_count,
178 hs_matches = total_hs_matches,
179 "coalesced scan phase 1 complete"
180 );
181 }
182
183 // Phase 2: Full extraction on hit files + multiline fallback (parallel).
184 let _p2 = std::time::Instant::now();
185 let mut results: Vec<Vec<keyhog_core::RawMatch>> = chunks
186 .par_iter()
187 .zip(triggers.into_par_iter())
188 .map(|(chunk, triggered_opt)| {
189 if let Some(triggered) = triggered_opt {
190 let prepared = self.prepare_chunk(chunk);
191 return self.scan_prepared_with_triggered(
192 prepared,
193 ScanBackend::SimdCpu,
194 triggered,
195 None,
196 );
197 }
198 // Multiline fallback: files with concatenation indicators AND
199 // secret-related keywords may contain secrets split across lines
200 // that HS can't match on raw bytes. Only scan these selectively.
201 #[cfg(feature = "multiline")]
202 if crate::multiline::has_concatenation_indicators(&chunk.data)
203 && has_secret_keyword_fast(chunk.data.as_bytes())
204 {
205 let prepared = self.prepare_chunk(chunk);
206 if prepared.preprocessed.text.as_bytes() != chunk.data.as_bytes() {
207 let triggered = self.collect_triggered_patterns_for_backend(
208 &prepared.preprocessed.text,
209 ScanBackend::SimdCpu,
210 );
211 let mut matches = self.scan_prepared_with_triggered(
212 prepared,
213 ScanBackend::SimdCpu,
214 triggered,
215 None,
216 );
217 self.record_and_reassemble_for_no_hit_chunk(chunk, &mut matches);
218 return matches;
219 }
220 }
221
222 // Task #69 follow-up: scan_fallback_patterns runs the
223 // keyword-AC-gated prefix-less detectors (kubernetes-
224 // bootstrap-token, asana-pat, mailchimp #3, ...). The
225 // SIMD-hit branch above routes through that call via
226 // scan_prepared_with_triggered; this no-hit branch
227 // historically only ran scan_generic_assignments, so
228 // any chunk WITHOUT a literal-prefix HS hit silently
229 // dropped every fallback detector - including
230 // standalone-on-a-line k8s bootstrap tokens. Fix:
231 // for chunks that plausibly carry a secret (have a
232 // generic-assignment-keyword OR an explicit secret-
233 // prefix substring like ghp_/sk-proj-/etc.) drive
234 // scan_prepared_with_triggered directly with an empty
235 // trigger bitmap (reusing phase 1's HS result rather
236 // than re-running the automaton), which walks
237 // scan_fallback_patterns → scan_generic_assignments
238 // → scan_entropy_fallback.
239 //
240 // Bound on plausibility: pure source-code files
241 // without any secret-related keyword stay on the
242 // Vec::new() fast path so the per-chunk prepare +
243 // re-Hyperscan cost doesn't regress monorepo scans
244 // (gitlabhq: 64k mostly-source files would otherwise
245 // pay 64k * ~150µs per-chunk fallback walks). The
246 // gate is intentionally permissive - `token`,
247 // `password`, `secret`, `api_key` cover every config
248 // file shape that planted-credential corpora use.
249 //
250 // Cap stays at 32 KB to match the previous
251 // generic-assignment cap: large source files
252 // (>32 KB) are almost never config and the per-file
253 // fallback walk on Go/Java/Python framework code is
254 // dead work.
255 // Third gate (added 2026-05-29): chunks containing a
256 // contiguous base62 run >= 32 chars - the
257 // generic-high-entropy-string corpus shape (a bare
258 // entropy token with NO keyword anchor). Without
259 // this, that category sat at recall 0.36 on the
260 // SecretBench mirror; the entropy fallback never
261 // saw the chunk because no keyword admitted it.
262 //
263 // Keep this gate aligned with scan_entropy_fallback's
264 // own path/config admission. A high-entropy run inside
265 // `src/*.rs` cannot produce an entropy finding when
266 // `entropy_in_source_files=false`, so admitting that
267 // chunk only pays prepare/fallback/generic work before
268 // entropy immediately returns.
269 let data = chunk.data.as_bytes();
270 let entropy_admits = self.config.entropy_enabled
271 && crate::entropy::is_entropy_appropriate(
272 chunk.metadata.path.as_deref(),
273 self.config.entropy_in_source_files,
274 )
275 && has_high_entropy_run_fast(data);
276 if chunk.data.len() <= 32 * 1024
277 && (has_generic_assignment_keyword(data)
278 || has_secret_keyword_fast(data)
279 || entropy_admits)
280 {
281 // KH perf: this is a no-HS-hit chunk - phase 1
282 // already ran the Hyperscan automaton over these
283 // bytes and found no literal-prefix hit (the empty
284 // trigger bitmap was discarded as `None`). Calling
285 // `scan_inner` here would call
286 // `collect_triggered_patterns_for_backend` ->
287 // `collect_triggered_patterns_simd`, which runs the
288 // FULL Hyperscan automaton a SECOND time over the
289 // same bytes for a result we already know is empty.
290 // Reuse the phase-1 result instead: prepare the
291 // chunk and drive `scan_prepared_with_triggered`
292 // directly with an EMPTY trigger bitmap. The
293 // confirmed-pattern extraction is correctly skipped
294 // (no AC pattern fired); the keyword-AC fallback,
295 // generic-assignment, and entropy stages run off
296 // `code_lines` / preprocessed text and need no HS
297 // pass - which is exactly the work this branch
298 // wants. Saves one full Hyperscan walk per
299 // keyworded no-hit file.
300 let prepared = self.prepare_chunk(chunk);
301 let triggered =
302 if prepared.preprocessed.text.as_bytes() == chunk.data.as_bytes() {
303 Vec::new()
304 } else {
305 // Phase 1 scanned raw bytes. Structured
306 // preprocessors append decoded/configured
307 // credential lines, so a no-hit raw chunk can
308 // still contain named-detector literal roots in
309 // the preprocessed text. Recollect only on that
310 // rare drift path and keep the raw no-hit fast
311 // path allocation-free.
312 self.collect_triggered_patterns_for_backend(
313 &prepared.preprocessed.text,
314 ScanBackend::SimdCpu,
315 )
316 };
317 let mut matches = self.scan_prepared_with_triggered(
318 prepared,
319 ScanBackend::SimdCpu,
320 triggered,
321 None,
322 );
323 // Preserve cross-file fragment reassembly that
324 // the previous no-hit branch did. The fragment
325 // cache is mostly populated by named-detector
326 // matches that scan_inner now produces (e.g.
327 // an `AWS_ACCESS_KEY=` match in one .env file
328 // gets recorded for subsequent reassembly with an
329 // `AWS_SECRET=` match in another).
330 self.record_and_reassemble_for_no_hit_chunk(chunk, &mut matches);
331 return matches;
332 }
333
334 Vec::new()
335 })
336 .collect();
337
338 let _p2e = _p2.elapsed();
339 // Cross-chunk reassembly: synthesize a thin boundary buffer
340 // from the tail of each chunk + head of its right neighbour
341 // (same file, gapless) and scan it. Catches secrets split
342 // across the 64 MiB scan-window boundary that in-chunk scan
343 // can't see.
344 let _bt = std::time::Instant::now();
345 super::boundary::scan_chunk_boundaries(self, chunks, &mut results);
346 if std::env::var_os("KH_PERF").is_some() {
347 eprintln!(
348 "KH_PERF scan_coalesced: chunks={} p1={:.3}s p2={:.3}s boundary={:.3}s",
349 chunks.len(),
350 _p1e.as_secs_f64(),
351 _p2e.as_secs_f64(),
352 _bt.elapsed().as_secs_f64()
353 );
354 }
355 results
356 } // #[cfg(feature = "simd")] block
357 } // scan_coalesced
358
359 pub(crate) fn scan_inner(
360 &self,
361 chunk: &Chunk,
362 backend: crate::hw_probe::ScanBackend,
363 deadline: Option<std::time::Instant>,
364 ) -> Vec<RawMatch> {
365 // KH-116: Record scan metrics atomically
366 crate::telemetry::record_file_scanned(chunk.data.len());
367 if backend == crate::hw_probe::ScanBackend::Gpu
368 || backend == crate::hw_probe::ScanBackend::MegaScan
369 {
370 crate::telemetry::record_gpu_dispatch();
371 }
372 let prepared = self.prepare_chunk(chunk);
373 let triggered =
374 self.collect_triggered_patterns_for_backend(&prepared.preprocessed.text, backend);
375 self.scan_prepared_with_triggered(prepared, backend, triggered, deadline)
376 }
377
378 /// Record each match as a SecretFragment in the cross-file
379 /// reassembly cache and scan any reassembled candidates. Lifted
380 /// from the inline no-hit branch in scan_coalesced when that branch
381 /// was rerouted through scan_inner: scan_inner produces the matches,
382 /// and this helper continues the previous fragment-cache flow on
383 /// top of them so monorepo scans still pair AWS_ACCESS_KEY in one
384 /// .env with AWS_SECRET in another.
385 #[cfg(feature = "simd")]
386 fn record_and_reassemble_for_no_hit_chunk(&self, chunk: &Chunk, matches: &mut Vec<RawMatch>) {
387 if matches.is_empty() {
388 return;
389 }
390 // Fast plausibility gate before paying three String allocs per
391 // match (prefix/var_name/value) and the sharded fragment-cache
392 // mutex per record. Cross-file reassembly only fires for fragments
393 // that carry assignment-like syntax (a `=`/`:` plus a quote, the
394 // `var = "value"` shape the fragment cache pairs on). A chunk with
395 // no such syntax cannot contribute a poolable fragment, so the
396 // record + lock + reassemble work is dead. Mirrors the
397 // `has_fragment_assignment_syntax` check in scan_postprocess.rs;
398 // inlined here (it is private to that module) to keep this on a
399 // single cheap memchr pass.
400 let data = chunk.data.as_bytes();
401 let has_assignment =
402 memchr::memchr(b'=', data).is_some() || memchr::memchr(b':', data).is_some();
403 let has_quote = memchr::memchr(b'"', data).is_some()
404 || memchr::memchr(b'\'', data).is_some()
405 || memchr::memchr(b'`', data).is_some();
406 if !(has_assignment && has_quote) {
407 return;
408 }
409 // KH-01: Pre-allocate raw match output vectors with a capacity of 16 entries to avoid resizing
410 let mut reassembled_candidates = Vec::with_capacity(16);
411 // Pre-allocate the path Arc once per chunk: every match in a
412 // single chunk shares the same path, so cloning an Arc<str>
413 // reference is cheaper than cloning the owned String per-match.
414 let path_arc: Option<std::sync::Arc<str>> = chunk
415 .metadata
416 .path
417 .as_deref()
418 .map(std::sync::Arc::<str>::from);
419 if matches.capacity() < matches.len() + 16 {
420 matches.reserve(16);
421 }
422 for m in matches.iter() {
423 if let Some(path) = path_arc.as_ref() {
424 let fragment = crate::fragment_cache::SecretFragment {
425 prefix: m.detector_id.to_string(),
426 var_name: m.detector_name.to_string(),
427 value: zeroize::Zeroizing::new(m.credential.to_string()),
428 line: m.location.line.unwrap_or(0),
429 path: Some(std::sync::Arc::clone(path)),
430 };
431 // Stamped variant: cross-file pooling is impossible now
432 // (scoped_key keys on the full path), and each candidate
433 // carries the anchor fragment's real path/line so the
434 // synthesized finding is attributed to the contributing
435 // file rather than to the current chunk's metadata.
436 let reassembled = self.fragment_cache.record_and_reassemble_stamped(fragment);
437 reassembled_candidates.extend(reassembled);
438 }
439 }
440 for candidate in reassembled_candidates {
441 // candidate.value is Zeroizing<String> - scrubbed when this
442 // iteration ends.
443 let entropy = crate::pipeline::match_entropy(candidate.value.as_bytes());
444 if entropy < 3.0 || candidate.value.len() < 16 {
445 continue;
446 }
447 let mut dummy_data = String::with_capacity(candidate.value.len() + 24);
448 dummy_data.push_str("reassembled_key = \"");
449 dummy_data.push_str(candidate.value.as_str());
450 dummy_data.push('"');
451 // Stamp the dummy chunk's metadata from the ANCHOR fragment's
452 // path, not chunk.metadata.clone(): the contributing
453 // fragment may have come from a different file than the chunk
454 // currently being scanned (same coalesced batch). Falling
455 // back to chunk.metadata is only for the shouldn't-happen
456 // case where the anchor lost its path.
457 let mut dummy_metadata = chunk.metadata.clone();
458 if let Some(frag_path) = candidate.path.as_deref() {
459 dummy_metadata.path = Some(frag_path.to_string());
460 }
461 let dummy_chunk = Chunk {
462 data: dummy_data.into(),
463 metadata: dummy_metadata,
464 };
465 // Tiny synthesized chunk; skip GPU unconditionally -
466 // per-dispatch overhead dwarfs the work. Matches the
467 // scan_cross_chunk_fragments rationale.
468 let backend = crate::hw_probe::ScanBackend::SimdCpu;
469 let mut reassembled_matches = self.scan_inner(&dummy_chunk, backend, None);
470 // Point each reassembled finding at the anchor fragment's
471 // real source line so the finding's location matches the file
472 // its metadata now names.
473 for rm in &mut reassembled_matches {
474 rm.location.line = Some(candidate.line);
475 }
476 matches.append(&mut reassembled_matches);
477 }
478 }
479}