keyhog_scanner/engine/scan.rs
1use super::scan_filters::*;
2use super::*;
3#[cfg(feature = "simd")]
4use std::cell::RefCell;
5
6// The trigger-buffer pool is only used in the Hyperscan-prefilter
7// scratch path of `scan_coalesced` (gated `#[cfg(feature = "simd")]`).
8// Without `simd`, both the pool and the helper become dead code,
9// so gate them too - otherwise `cargo build --no-default-features`
10// (the no-Hyperscan Windows build) emits dead-code warnings.
11//
12// Note: a previous attempt extended this pool to the per-chunk
13// `collect_triggered_patterns_*` builders. That regressed the
14// long-lines bench by ~12% because those builders return
15// `Vec<u64>` to their callers - the pool can't save the
16// allocation, only adds the thread_local + RefCell overhead.
17// The pool's win is reuse of buffers that stay inside the pool.
#[cfg(feature = "simd")]
thread_local! {
    /// Per-thread pool of trigger-bitmask vectors. Phase-1 of `scan_coalesced`
    /// allocates one `Vec<u64>` of size `ac_len.div_ceil(64)` per chunk. On a
    /// 100k-file scan with 1500 patterns that's ~2.4M tiny allocations
    /// hammering the global allocator. With this pool, each rayon worker
    /// reuses a single buffer across all the chunks it processes.
    ///
    /// The pool starts as an empty `Vec` and is grown on demand by
    /// `with_trigger_buffer`, which is the only intended way to access it.
    /// Being a `RefCell`, it supports one live borrow per thread at a time.
    static TRIGGER_POOL: RefCell<Vec<u64>> = const { RefCell::new(Vec::new()) };
}
27
/// Lend `f` a zero-initialised scratch slice of exactly `words_needed`
/// `u64` words, backed by this thread's `TRIGGER_POOL` buffer.
///
/// The backing allocation is kept between calls, so steady-state use on a
/// worker thread performs no heap allocation. Whatever `f` returns is passed
/// straight through to the caller.
///
/// `f` must not call `with_trigger_buffer` again on the same thread: the pool
/// is a `RefCell`, and a nested `borrow_mut` panics.
#[cfg(feature = "simd")]
#[inline]
fn with_trigger_buffer<R>(words_needed: usize, f: impl FnOnce(&mut [u64]) -> R) -> R {
    TRIGGER_POOL.with(|pool| {
        let mut words = pool.borrow_mut();
        // `clear` keeps the capacity; `resize` then hands back exactly
        // `words_needed` zeroed words, growing the allocation only if needed.
        words.clear();
        words.resize(words_needed, 0);
        f(words.as_mut_slice())
    })
}
41
42/// Compute the two per-pattern-constant confidence signals.
43/// Extracted so both `extract_grouped_matches` and
44/// `extract_plain_matches` share the same lazy `OnceCell` init
45/// closure body (Rust can't `impl FnOnce<>` to share inline).
46/// `pub(super)` so the extract submodule (`engine/extract.rs`) can call
47/// it after the scan.rs / extract.rs / process.rs split.
48pub(super) fn compute_pattern_signals(detector: &DetectorSpec, chunk: &Chunk) -> (bool, bool) {
49 let kw = detector
50 .keywords
51 .iter()
52 .any(|keyword| chunk.data.contains(keyword.as_str()));
53 let sf = chunk
54 .metadata
55 .path
56 .as_deref()
57 .map(crate::confidence::is_sensitive_path)
58 .unwrap_or(false);
59 (kw, sf)
60}
61
62impl CompiledScanner {
63 /// High-throughput coalesced scan: all files scanned in parallel,
64 /// zero overhead for non-hit files.
65 ///
66 /// Architecture:
67 /// Phase 1: Parallel HS prefilter on raw bytes (no prep, no alloc)
68 /// Phase 2: Full extraction only on hit files (~5% of total)
69 #[allow(clippy::needless_return)] // return needed under non-simd cfg branch
70 pub fn scan_coalesced(&self, chunks: &[keyhog_core::Chunk]) -> Vec<Vec<keyhog_core::RawMatch>> {
71 #[cfg(feature = "simd")]
72 use crate::hw_probe::ScanBackend;
73 use rayon::prelude::*;
74
75 #[cfg(not(feature = "simd"))]
76 {
77 // Parallel CPU dispatch - same reasoning as scan_chunks_with_backend:
78 // the per-chunk scan is independent and CPU-bound.
79 let mut results: Vec<Vec<keyhog_core::RawMatch>> =
80 chunks.par_iter().map(|c| self.scan(c)).collect();
81 super::boundary::scan_chunk_boundaries(self, chunks, &mut results);
82 return results;
83 }
84
85 #[cfg(feature = "simd")]
86 {
87 let Some(scanner) = &self.simd_prefilter else {
88 // Hyperscan failed to initialize at compile time - fall back
89 // to per-chunk parallel SimdCpu (or whichever backend the
90 // scanner picks). Was serial; now uses rayon.
91 return chunks.par_iter().map(|c| self.scan(c)).collect();
92 };
93
94 let ac_len = self.ac_map.len();
95
96 // Phase 1: Parallel HS scan on RAW bytes. No prepare, no Arc, no alloc
97 // for non-hit files. Thread-local scratch + a per-worker bitmask
98 // POOL eliminate the per-chunk `vec![0u64; …]` alloc - we still
99 // need owned Vecs in the result so phase 2 can consume them, but
100 // empty-result chunks return `None` and skip the alloc entirely.
101 let words_needed = ac_len.div_ceil(64);
102 let triggers: Vec<Option<Vec<u64>>> = chunks
103 .par_iter()
104 .map(|chunk| {
105 let data = chunk.data.as_bytes();
106 with_trigger_buffer(words_needed, |scratch| {
107 for (hs_id, _start, _end) in scanner.scan(data) {
108 let Some((_det, dedup_id, _grp)) = scanner.pattern_info(hs_id) else {
109 continue;
110 };
111 if let Some(orig) = self.hs_index_map.get(dedup_id) {
112 for &idx in orig {
113 if idx < ac_len {
114 scratch[idx / 64] |= 1u64 << (idx % 64);
115 }
116 }
117 }
118 }
119 if scratch.iter().any(|&w| w != 0) {
120 Some(scratch.to_vec())
121 } else {
122 None
123 }
124 })
125 })
126 .collect();
127
128 let hit_count = triggers.iter().filter(|t| t.is_some()).count();
129 let total_hs_matches: usize = triggers
130 .iter()
131 .filter_map(|t| t.as_ref())
132 .map(|t| t.iter().map(|w| w.count_ones() as usize).sum::<usize>())
133 .sum();
134 tracing::info!(
135 files = chunks.len(),
136 hits = hit_count,
137 hs_matches = total_hs_matches,
138 "coalesced scan phase 1 complete"
139 );
140
141 // Phase 2: Full extraction on hit files + multiline fallback (parallel).
142 let mut results: Vec<Vec<keyhog_core::RawMatch>> = chunks
143 .par_iter()
144 .zip(triggers.into_par_iter())
145 .map(|(chunk, triggered_opt)| {
146 if let Some(triggered) = triggered_opt {
147 let prepared = self.prepare_chunk(chunk);
148 return self.scan_prepared_with_triggered(
149 prepared,
150 ScanBackend::SimdCpu,
151 triggered,
152 None,
153 );
154 }
155 // Multiline fallback: files with concatenation indicators AND
156 // secret-related keywords may contain secrets split across lines
157 // that HS can't match on raw bytes. Only scan these selectively.
158 #[cfg(feature = "multiline")]
159 if crate::multiline::has_concatenation_indicators(&chunk.data)
160 && has_secret_keyword_fast(chunk.data.as_bytes())
161 {
162 return self.scan(chunk);
163 }
164
165 // Task #69 follow-up: scan_fallback_patterns runs the
166 // keyword-AC-gated prefix-less detectors (kubernetes-
167 // bootstrap-token, asana-pat, mailchimp #3, ...). The
168 // SIMD-hit branch above routes through that call via
169 // scan_prepared_with_triggered; this no-hit branch
170 // historically only ran scan_generic_assignments, so
171 // any chunk WITHOUT a literal-prefix HS hit silently
172 // dropped every fallback detector - including
173 // standalone-on-a-line k8s bootstrap tokens. Fix:
174 // for chunks that plausibly carry a secret (have a
175 // generic-assignment-keyword OR an explicit secret-
176 // prefix substring like ghp_/sk-proj-/etc.) route
177 // through scan_inner, which walks
178 // scan_prepared_with_triggered → scan_fallback_patterns
179 // → scan_generic_assignments → scan_entropy_fallback.
180 //
181 // Bound on plausibility: pure source-code files
182 // without any secret-related keyword stay on the
183 // Vec::new() fast path so the per-chunk prepare +
184 // re-Hyperscan cost doesn't regress monorepo scans
185 // (gitlabhq: 64k mostly-source files would otherwise
186 // pay 64k * ~150µs per-chunk fallback walks). The
187 // gate is intentionally permissive - `token`,
188 // `password`, `secret`, `api_key` cover every config
189 // file shape that planted-credential corpora use.
190 //
191 // Cap stays at 32 KB to match the previous
192 // generic-assignment cap: large source files
193 // (>32 KB) are almost never config and the per-file
194 // fallback walk on Go/Java/Python framework code is
195 // dead work.
196 // Third gate (added 2026-05-29): chunks containing a
197 // contiguous base62 run >= 32 chars - the
198 // generic-high-entropy-string corpus shape (a bare
199 // entropy token with NO keyword anchor). Without
200 // this, that category sat at recall 0.36 on the
201 // SecretBench mirror; the entropy fallback never
202 // saw the chunk because no keyword admitted it.
203 // Hash/UUID FPs are still suppressed downstream by
204 // looks_like_hash_digest / is_uuid_v4_shape, so the
205 // wider gate trades pipeline cost for recall, not
206 // FPs. Cost cap stays at 32 KB so monorepo scans
207 // (gitlabhq, etc.) don't pay per-chunk fallback
208 // walks on >32 KB source files.
209 if chunk.data.len() <= 32 * 1024
210 && (has_generic_assignment_keyword(chunk.data.as_bytes())
211 || has_secret_keyword_fast(chunk.data.as_bytes())
212 || has_high_entropy_run_fast(chunk.data.as_bytes()))
213 {
214 let mut matches = self.scan_inner(chunk, ScanBackend::SimdCpu, None);
215 // KH-01: Pre-allocate raw match output vectors with a capacity of 16 entries to avoid resizing
216 if matches.capacity() < 16 {
217 matches.reserve(16 - matches.len());
218 }
219 // Preserve cross-file fragment reassembly that
220 // the previous no-hit branch did. The fragment
221 // cache is mostly populated by named-detector
222 // matches that scan_inner now produces (e.g.
223 // an `AWS_ACCESS_KEY=` match in one .env file
224 // gets recorded for later reassembly with an
225 // `AWS_SECRET=` match in another).
226 self.record_and_reassemble_for_no_hit_chunk(chunk, &mut matches);
227 return matches;
228 }
229
230 Vec::new()
231 })
232 .collect();
233
234 // Cross-chunk reassembly: synthesize a thin boundary buffer
235 // from the tail of each chunk + head of its right neighbour
236 // (same file, gapless) and scan it. Catches secrets split
237 // across the 64 MiB scan-window boundary that in-chunk scan
238 // can't see.
239 super::boundary::scan_chunk_boundaries(self, chunks, &mut results);
240 results
241 } // #[cfg(feature = "simd")] block
242 } // scan_coalesced
243
244 pub(crate) fn scan_inner(
245 &self,
246 chunk: &Chunk,
247 backend: crate::hw_probe::ScanBackend,
248 deadline: Option<std::time::Instant>,
249 ) -> Vec<RawMatch> {
250 // KH-116: Record scan metrics atomically
251 crate::telemetry::record_file_scanned(chunk.data.len());
252 if backend == crate::hw_probe::ScanBackend::Gpu
253 || backend == crate::hw_probe::ScanBackend::MegaScan
254 {
255 crate::telemetry::record_gpu_dispatch();
256 }
257 let prepared = self.prepare_chunk(chunk);
258 let triggered =
259 self.collect_triggered_patterns_for_backend(&prepared.preprocessed.text, backend);
260 self.scan_prepared_with_triggered(prepared, backend, triggered, deadline)
261 }
262
/// Record each match as a SecretFragment in the cross-file
/// reassembly cache and scan any reassembled candidates. Lifted
/// from the inline no-hit branch in scan_coalesced when that branch
/// was rerouted through scan_inner: scan_inner produces the matches,
/// and this helper continues the previous fragment-cache flow on
/// top of them so monorepo scans still pair AWS_ACCESS_KEY in one
/// .env with AWS_SECRET in another.
///
/// `matches` is extended in place with whatever the reassembled
/// candidates yield. Nothing is recorded - and nothing is allocated -
/// when the chunk has no path or `matches` is empty, which is the common
/// case on the no-hit path.
#[cfg(feature = "simd")]
fn record_and_reassemble_for_no_hit_chunk(&self, chunk: &Chunk, matches: &mut Vec<RawMatch>) {
    // Fragments are only recorded for chunks that carry a path, and only
    // existing matches produce fragments. Bail out before touching the
    // allocator when either is missing.
    let Some(path) = chunk.metadata.path.as_deref() else {
        return;
    };
    if matches.is_empty() {
        return;
    }

    // Build the path Arc once per chunk: every match in a single chunk
    // shares the same path, so cloning an Arc<str> reference is cheaper
    // than cloning the owned String per-match.
    let path_arc = std::sync::Arc::<str>::from(path);

    // Grown lazily: most chunks reassemble nothing, so an up-front
    // capacity would be a wasted allocation per chunk.
    let mut reassembled_candidates = Vec::new();
    for m in matches.iter() {
        let fragment = crate::fragment_cache::SecretFragment {
            prefix: m.detector_id.to_string(),
            var_name: m.detector_name.to_string(),
            value: zeroize::Zeroizing::new(m.credential.to_string()),
            line: m.location.line.unwrap_or(0),
            path: Some(std::sync::Arc::clone(&path_arc)),
        };
        reassembled_candidates.extend(self.fragment_cache.record_and_reassemble(fragment));
    }

    for candidate in reassembled_candidates {
        // candidate is Zeroizing<String> - scrubbed when this
        // iteration ends.
        // Length is checked first because it is free; the entropy
        // computation only runs for candidates long enough to matter.
        if candidate.len() < 16 || crate::pipeline::match_entropy(candidate.as_bytes()) < 3.0 {
            continue;
        }
        // Wrap the candidate in an assignment so the regular detectors
        // see it in a shape they recognise.
        // NOTE(review): this copies the candidate into `dummy_chunk.data`;
        // whether that copy is scrubbed on drop depends on the type of
        // `Chunk::data` - confirm.
        let mut dummy_data = String::with_capacity(candidate.len() + 24);
        dummy_data.push_str("reassembled_key = \"");
        dummy_data.push_str(candidate.as_str());
        dummy_data.push('"');
        let dummy_chunk = Chunk {
            data: dummy_data.into(),
            metadata: chunk.metadata.clone(),
        };
        // Tiny synthesized chunk; skip GPU unconditionally -
        // per-dispatch overhead dwarfs the work. Matches the
        // scan_cross_chunk_fragments rationale.
        let backend = crate::hw_probe::ScanBackend::SimdCpu;
        matches.extend(self.scan_inner(&dummy_chunk, backend, None));
    }
}
321}